geminispace.info

Unnamed repository; edit this file 'description' to name the repository.
git clone git://code.clttr.info/geminispace.info.git
Log | Files | Refs | README | LICENSE

commit f0c091b153051a042f4763fd0eb5da97bc41bd27
parent 0b4062fe6319107c42778b324aced2bee6b94444
Author: René Wagner <rwa@clttr.info>
Date:   Sun,  6 Feb 2022 19:13:30 +0100

some sql adjustments

Diffstat:
M gus/crawl.py               | 2 +-
M gus/lib/index_statistics.py | 8 ++++----
M serve/models.py            | 20 ++++++++++++--------
3 files changed, 17 insertions(+), 13 deletions(-)

diff --git a/gus/crawl.py b/gus/crawl.py
@@ -512,7 +512,7 @@ def crawl_page(
 def load_expired_urls():
     expired_pages = Page.raw(
-        """SELECT url
+        """SELECT p.url
 FROM page as p
 WHERE datetime(last_crawl_at, REPLACE('fnord hours', 'fnord', change_frequency)) < datetime('now') OR last_crawl_at IS NULL"""
     )
     return [page.url for page in expired_pages.execute()]
diff --git a/gus/lib/index_statistics.py b/gus/lib/index_statistics.py
@@ -10,11 +10,11 @@ from gus.lib.db_model import Page
 def compute_index_statistics(db):
     page_count = len(Page.raw("""SELECT DISTINCT p.id
 FROM page AS p
-WHERE last_success_status == 20 AND last_crawl_success_at IS NOT NULL""").dicts())
+WHERE last_crawl_success_at IS NOT NULL AND last_success_status == 20""").dicts())

     domains_query = Page.raw("""SELECT DISTINCT p.domain, p.port
 FROM page AS p
-WHERE last_success_status == 20 AND last_crawl_success_at IS NOT NULL""")
+WHERE last_crawl_success_at IS NOT NULL AND last_success_status == 20""")
     domains = []
     for d in domains_query.execute():
         s = d.domain
@@ -32,12 +32,12 @@ WHERE last_success_status == 20 AND last_crawl_success_at IS NOT NULL""")
     content_type_frequencies = (Page.raw("""SELECT p.content_type, count(p.content_type) as 'count'
 FROM page AS p
-WHERE last_success_status == 20 AND last_crawl_success_at IS NOT NULL
+WHERE last_crawl_success_at IS NOT NULL AND last_success_status == 20
 GROUP BY p.content_type
 ORDER BY 2 desc""").dicts())
     charset_frequencies = (Page.raw("""SELECT upper(p.charset), count(p.id) as 'count'
 FROM page AS p
-WHERE last_success_status == 20 AND last_crawl_success_at IS NOT NULL AND p.charset IS NOT NULL
+WHERE last_crawl_success_at IS NOT NULL AND last_success_status == 20 AND p.charset IS NOT NULL
 GROUP BY upper(p.charset)
 ORDER BY 2 desc""").dicts())
     index_modification_time = Page.select(fn.Max(Page.last_crawl_at)).scalar()
diff --git a/serve/models.py b/serve/models.py
@@ -26,8 +26,8 @@ class GUS:
             SELECT DISTINCT p.domain
             FROM page AS p
             WHERE last_crawl_success_at IS NOT NULL
-            AND last_status = 20
-            ORDER BY p.domain
+            AND last_status = 20
+            ORDER BY p.domain
             """
         )
         self.hosts = hosts_query.execute()
@@ -36,8 +36,8 @@ class GUS:
             SELECT p.domain, p.first_seen_at
             FROM page AS p
             WHERE last_crawl_success_at IS NOT NULL
-            AND first_seen_at IS NOT NULL
-            AND last_status = 20
+            AND last_status = 20
+            AND first_seen_at IS NOT NULL
             GROUP BY p.domain
             ORDER BY first_seen_at DESC
             LIMIT 30
@@ -47,19 +47,23 @@ class GUS:
         newest_pages_query = Page.raw(
             """SELECT p.url, p.fetchable_url, p.first_seen_at
             FROM page as p
-            WHERE last_crawl_success_at IS NOT NULL AND first_seen_at IS NOT NULL
+            WHERE last_crawl_success_at IS NOT NULL
+            AND last_status = 20
+            AND first_seen_at IS NOT NULL
             ORDER BY first_seen_at DESC
             LIMIT 50""")
         self.newest_pages = newest_pages_query.execute()
         feeds_query = Page.raw(
-            """SELECT DISTINCT p.*
+            """SELECT DISTINCT p.url, p.fetchable_url
             FROM page AS p
-            WHERE (p.url LIKE '%atom.xml'
+            WHERE p.last_crawl_success_at IS NOT NULL
+            AND last_status = 20
+            AND p.url LIKE '%atom.xml'
             OR p.url LIKE '%feed.xml'
             OR p.url LIKE '%.rss'
             OR p.url LIKE '%.atom'
             OR p.content_type IN ('application/atom+xml', 'application/rss+xml'))
-            AND p.last_crawl_success_at IS NOT NULL""")
+            """)
         self.feeds = feeds_query.execute()