geminispace.info

Unnamed repository; edit this file 'description' to name the repository.
git clone git://code.clttr.info/geminispace.info.git
Log | Files | Refs | README | LICENSE

commit 43397bdda337c7a0f31019358bd56eb0ae87a993
parent 5eebbbfc00555da619054e8129ad70bf3de99fd5
Author: Natalie Pendragon <natpen@natpen.net>
Date:   Fri,  6 Nov 2020 08:42:57 -0500

Reformat code with Black

Diffstat:
Mgus/__init__.py | 2+-
Mgus/build_index.py | 104+++++++++++++++++++++++++++++++++++++-------------------------------------------
Mgus/crawl.py | 346++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----------------------
Mgus/lib/db_model.py | 25+++++++++++++++++--------
Mgus/lib/domain.py | 1+
Mgus/lib/gemini.py | 151++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------------------
Mgus/lib/logging.py | 15+++++++--------
Mgus/lib/misc.py | 25+++++++++++++++++--------
Mgus/lib/whoosh_extensions.py | 23+++++++++++++++--------
Mserve/constants.py | 6+-----
Mserve/main.py | 20++++----------------
Mserve/models.py | 156++++++++++++++++++++++++++++++++++++++++++++++++--------------------------------
Mserve/views.py | 186++++++++++++++++++++++++++++++++++++++++++++++++-------------------------------
Mtests/gus/lib/test_gemini.py | 29+++++++++++++++++------------
14 files changed, 685 insertions(+), 404 deletions(-)

diff --git a/gus/__init__.py b/gus/__init__.py @@ -1 +1 @@ -__version__ = '0.1.0' +__version__ = "0.1.0" diff --git a/gus/build_index.py b/gus/build_index.py @@ -13,7 +13,11 @@ from whoosh.index import open_dir from gus.crawl import EXCLUDED_URL_PREFIXES from gus.lib.db_model import init_db, Page -from gus.lib.index_statistics import compute_index_statistics, persist_statistics, log_index_statistics +from gus.lib.index_statistics import ( + compute_index_statistics, + persist_statistics, + log_index_statistics, +) from gus.lib.whoosh_extensions import UrlAnalyzer import gus.lib.logging @@ -39,42 +43,16 @@ def create_index(index_dir): # shutil.rmtree(index_dir, ignore_errors=True) pathlib.Path(index_dir).mkdir(parents=True, exist_ok=True) schema = Schema( - url_id=ID( - unique=True, - ), - url=TEXT( - field_boost=2.0, - stored=True, - analyzer=UrlAnalyzer(), - ), + url_id=ID(unique=True,), + url=TEXT(field_boost=2.0, stored=True, analyzer=UrlAnalyzer(),), fetchable_url=STORED(), - domain=TEXT( - analyzer=UrlAnalyzer(), - ), - port=NUMERIC( - int, - 32, - signed=False, - stored=True, - ), - content_type=TEXT( - stored=True, - ), - charset=ID( - stored=True, - ), - lang=ID( - stored=True, - ), - content=TEXT( - analyzer=FancyAnalyzer(), - spelling=True, - stored=True, - ), - prompt=TEXT( - analyzer=FancyAnalyzer(), - stored=True, - ), + domain=TEXT(analyzer=UrlAnalyzer(),), + port=NUMERIC(int, 32, signed=False, stored=True,), + content_type=TEXT(stored=True,), + charset=ID(stored=True,), + lang=ID(stored=True,), + content=TEXT(analyzer=FancyAnalyzer(), spelling=True, stored=True,), + prompt=TEXT(analyzer=FancyAnalyzer(), stored=True,), size=NUMERIC( int, # this means GUS will have problems indexing responses over ~2GB @@ -83,14 +61,9 @@ def create_index(index_dir): stored=True, ), backlink_count=NUMERIC( - int, - 16, # num bits, so max value is 65k - signed=False, - stored=True, - ), - indexed_at=DATETIME( - stored=True, + int, 16, signed=False, stored=True, # num bits, so max value is 65k ), + indexed_at=DATETIME(stored=True,), ) index_storage.create_index(schema) @@ -102,16 +75,23 @@ def index_page(page, indexed_urls): should_skip = True break if should_skip: - logging.debug('URL prefix matches exclusion list, skipping: %s', gus.lib.logging.strip_control_chars(page.url)) + logging.debug( + "URL prefix matches exclusion list, skipping: %s", + gus.lib.logging.strip_control_chars(page.url), + ) return False if page.fetchable_url in indexed_urls: - logging.debug('Page already indexed, skipping: %s', gus.lib.logging.strip_control_chars(page.url)) + logging.debug( + "Page already indexed, skipping: %s", + gus.lib.logging.strip_control_chars(page.url), + ) return False logging.info("Indexing page: %s", gus.lib.logging.strip_control_chars(page.url)) u = page.url.rstrip("/") - external_backlinks = Page.raw("""SELECT p_from.url + external_backlinks = Page.raw( + """SELECT p_from.url FROM page AS p_from JOIN indexable_crawl AS ic ON ic.page_id == p_from.id @@ -121,7 +101,10 @@ JOIN page as p_to ON p_to.id == l.to_page_id WHERE p_to.url IN (?, ?) AND l.is_cross_host_like == 1 -GROUP BY p_from.normalized_url""", u, f"{u}/") +GROUP BY p_from.normalized_url""", + u, + f"{u}/", + ) backlink_urls = [b.url for b in external_backlinks.execute()] backlink_count = len(backlink_urls) @@ -146,9 +129,12 @@ GROUP BY p_from.normalized_url""", u, f"{u}/") index_writer.add_document(**document) return True except: - logging.warn("Failed to index page: %s", gus.lib.logging.strip_control_chars(page.url)) + logging.warn( + "Failed to index page: %s", gus.lib.logging.strip_control_chars(page.url) + ) return False + def load_indexed_urls(index_dir): indexed_urls = [] ix = open_dir(index_dir) @@ -162,7 +148,9 @@ def load_indexed_urls(index_dir): def invalidate_recent_results(invalidation_window): recency_minimum = datetime.now() - timedelta(hours=invalidation_window) - pages = Page.select().where(Page.indexed_at.is_null(False), Page.indexed_at > recency_minimum) + pages = Page.select().where( + Page.indexed_at.is_null(False), Page.indexed_at > recency_minimum + ) for page in pages: index_writer.delete_by_term("url_id", page.url, searcher=None) @@ -183,13 +171,17 @@ def build_index(should_run_destructive=False, invalidation_window=0): index_writer = ix.writer() invalidate_recent_results(invalidation_window) - indexed_urls = [] if should_run_destructive else load_indexed_urls(INDEX_DIR_CURRENT) + indexed_urls = ( + [] if should_run_destructive else load_indexed_urls(INDEX_DIR_CURRENT) + ) - pages = Page.raw("""SELECT p.*, MAX(c.timestamp) AS crawl_timestamp + pages = Page.raw( + """SELECT p.*, MAX(c.timestamp) AS crawl_timestamp FROM indexable_crawl AS c JOIN page AS p ON p.id == c.page_id -GROUP BY p.normalized_url""") +GROUP BY p.normalized_url""" + ) i = 0 for page in pages.iterator(): @@ -202,10 +194,10 @@ GROUP BY p.normalized_url""") # it to flush segments to disk every 5000 documents, which # should scale well with Geminispace going forward. if i % 5000 == 0: - logging.debug('Committing index.') + logging.debug("Committing index.") index_writer.commit() index_writer = ix.writer() - logging.debug('Committing index for the last time.') + logging.debug("Committing index for the last time.") index_writer.commit() index_statistics = compute_index_statistics(db) @@ -216,7 +208,7 @@ GROUP BY p.normalized_url""") # shutil.rmtree(INDEX_DIR_CURRENT, ignore_errors=True) # shutil.move(INDEX_DIR_NEW, INDEX_DIR_CURRENT) - logging.info('Finished!') + logging.info("Finished!") def main(): @@ -226,7 +218,7 @@ def main(): def parse_args(): - parser = argparse.ArgumentParser(description='Crawl Geminispace.') + parser = argparse.ArgumentParser(description="Crawl Geminispace.") parser.add_argument( "--destructive", "-d", diff --git a/gus/crawl.py b/gus/crawl.py @@ -155,7 +155,10 @@ CRAWL_DELAYS = { def index_binary(resource, response): - logging.debug('Indexing binary for: %s', gus.lib.logging.strip_control_chars(resource.indexable_url)) + logging.debug( + "Indexing binary for: %s", + gus.lib.logging.strip_control_chars(resource.indexable_url), + ) doc = { "url": resource.indexable_url, @@ -171,15 +174,23 @@ def index_binary(resource, response): existing_page = Page.get_or_none(url=resource.indexable_url) if existing_page: doc["id"] = existing_page.id - existing_change_frequency = existing_page.change_frequency or resource.get_default_change_frequency("binary") - doc["change_frequency"] = resource.increment_change_frequency(existing_change_frequency, "binary") + existing_change_frequency = ( + existing_page.change_frequency + or resource.get_default_change_frequency("binary") + ) + doc["change_frequency"] = resource.increment_change_frequency( + existing_change_frequency, "binary" + ) page = Page(**doc) page.save() return page def index_redirect(resource): - logging.debug('Indexing redirect for: %s', gus.lib.logging.strip_control_chars(resource.indexable_url)) + logging.debug( + "Indexing redirect for: %s", + gus.lib.logging.strip_control_chars(resource.indexable_url), + ) doc = { "url": resource.indexable_url, @@ -192,15 +203,23 @@ def index_redirect(resource): existing_page = Page.get_or_none(url=resource.indexable_url) if existing_page: doc["id"] = existing_page.id - existing_change_frequency = existing_page.change_frequency or resource.get_default_change_frequency("redirect") - doc["change_frequency"] = resource.increment_change_frequency(existing_change_frequency, "redirect") + existing_change_frequency = ( + existing_page.change_frequency + or resource.get_default_change_frequency("redirect") + ) + doc["change_frequency"] = resource.increment_change_frequency( + existing_change_frequency, "redirect" + ) page = Page(**doc) page.save() return page def index_error(resource, is_temporary): - logging.debug('Indexing error for: %s', gus.lib.logging.strip_control_chars(resource.indexable_url)) + logging.debug( + "Indexing error for: %s", + gus.lib.logging.strip_control_chars(resource.indexable_url), + ) category = "temp_error" if is_temporary else "perm_error" default_change_frequency = resource.get_default_change_frequency(category) @@ -215,16 +234,22 @@ def index_error(resource, is_temporary): existing_page = Page.get_or_none(url=resource.indexable_url) if existing_page: doc["id"] = existing_page.id - existing_change_frequency = existing_page.change_frequency or default_change_frequency - doc["change_frequency"] = resource.increment_change_frequency(existing_change_frequency, category) + existing_change_frequency = ( + existing_page.change_frequency or default_change_frequency + ) + doc["change_frequency"] = resource.increment_change_frequency( + existing_change_frequency, category + ) page = Page(**doc) page.save() return page - def index_prompt(resource, response): - logging.debug('Indexing prompt for: %s', gus.lib.logging.strip_control_chars(resource.indexable_url)) + logging.debug( + "Indexing prompt for: %s", + gus.lib.logging.strip_control_chars(resource.indexable_url), + ) doc = { "url": resource.indexable_url, @@ -241,15 +266,23 @@ def index_prompt(resource, response): existing_page = Page.get_or_none(url=resource.indexable_url) if existing_page: doc["id"] = existing_page.id - existing_change_frequency = existing_page.change_frequency or resource.get_default_change_frequency("prompt") - doc["change_frequency"] = resource.increment_change_frequency(existing_change_frequency, "prompt") + existing_change_frequency = ( + existing_page.change_frequency + or resource.get_default_change_frequency("prompt") + ) + doc["change_frequency"] = resource.increment_change_frequency( + existing_change_frequency, "prompt" + ) page = Page(**doc) page.save() return page def index_content(resource, response): - logging.debug('Indexing content for: %s', gus.lib.logging.strip_control_chars(resource.indexable_url)) + logging.debug( + "Indexing content for: %s", + gus.lib.logging.strip_control_chars(resource.indexable_url), + ) doc = { "url": resource.indexable_url, @@ -264,7 +297,7 @@ def index_content(resource, response): "change_frequency": resource.get_default_change_frequency("content"), } if response.content_type == "text/gemini": - doc["lang"] = response.lang or "none", + doc["lang"] = (response.lang or "none",) existing_page = Page.get_or_none(url=resource.indexable_url) is_different = False if existing_page: @@ -272,10 +305,17 @@ def index_content(resource, response): if existing_page.content: is_different = doc["content"] != existing_page.content if is_different: - doc["change_frequency"] = resource.get_default_change_frequency("content") + doc["change_frequency"] = resource.get_default_change_frequency( + "content" + ) else: - existing_change_frequency = existing_page.change_frequency or resource.get_default_change_frequency("content") - doc["change_frequency"] = resource.increment_change_frequency(existing_change_frequency, "content") + existing_change_frequency = ( + existing_page.change_frequency + or resource.get_default_change_frequency("content") + ) + doc["change_frequency"] = resource.increment_change_frequency( + existing_change_frequency, "content" + ) page = Page(**doc) page.save() return page, is_different @@ -301,17 +341,21 @@ def index_links(from_resource, contained_resources): domain=cr.normalized_host, port=cr.urlsplit.port or 1965, ) - data.append({ - "from_page": from_page, - "to_page": to_page, - "is_cross_host_like": Link.get_is_cross_host_like(from_resource, cr), - }) + data.append( + { + "from_page": from_page, + "to_page": to_page, + "is_cross_host_like": Link.get_is_cross_host_like(from_resource, cr), + } + ) Link.insert_many(data).execute() def fetch_robots_file(robot_host): robot_url = urljoin("gemini://{}".format(robot_host), "/robots.txt") - logging.info('Fetching robots file: %s', gus.lib.logging.strip_control_chars(robot_url)) + logging.info( + "Fetching robots file: %s", gus.lib.logging.strip_control_chars(robot_url) + ) rp = GeminiRobotFileParser(robot_url) rp.read() @@ -322,30 +366,52 @@ def get_robots_file(robot_host): return robot_file_map[robot_host] -def crawl_page(gemini_resource, current_depth, should_check_if_expired=True, redirect_chain=[]): +def crawl_page( + gemini_resource, current_depth, should_check_if_expired=True, redirect_chain=[] +): gr = gemini_resource url = gr.fetchable_url if max_crawl_depth >= 0 and current_depth > max_crawl_depth: - logging.warn('Going too deep, skipping: %s', gus.lib.logging.strip_control_chars(url)) + logging.warn( + "Going too deep, skipping: %s", gus.lib.logging.strip_control_chars(url) + ) return if not gemini_resource.is_valid: - logging.warn('Not a valid gemini resource, skipping: %s', gus.lib.logging.strip_control_chars(url)) + logging.warn( + "Not a valid gemini resource, skipping: %s", + gus.lib.logging.strip_control_chars(url), + ) return for excluded_prefix in EXCLUDED_URL_PREFIXES: if gr.normalized_url.startswith(excluded_prefix): - logging.info('URL prefix matches exclusion list, skipping: %s', gus.lib.logging.strip_control_chars(url)) + logging.info( + "URL prefix matches exclusion list, skipping: %s", + gus.lib.logging.strip_control_chars(url), + ) return for excluded_path in EXCLUDED_URL_PATHS: if gr.urlsplit.path.lower().endswith(excluded_path): - logging.info('URL on exclusion list, skipping: %s', gus.lib.logging.strip_control_chars(url)) + logging.info( + "URL on exclusion list, skipping: %s", + gus.lib.logging.strip_control_chars(url), + ) return if should_check_if_expired: existing_page = Page.get_or_none(url=gr.indexable_url) if existing_page and existing_page.change_frequency is not None: - most_recent_crawl = Crawl.select(peewee.fn.MAX(Crawl.timestamp)).where(Crawl.page == existing_page).scalar() - if most_recent_crawl and datetime.now() < most_recent_crawl + timedelta(hours=existing_page.change_frequency): - logging.debug('Recrawling too soon, skipping: %s', gus.lib.logging.strip_control_chars(gr.fetchable_url)) + most_recent_crawl = ( + Crawl.select(peewee.fn.MAX(Crawl.timestamp)) + .where(Crawl.page == existing_page) + .scalar() + ) + if most_recent_crawl and datetime.now() < most_recent_crawl + timedelta( + hours=existing_page.change_frequency + ): + logging.debug( + "Recrawling too soon, skipping: %s", + gus.lib.logging.strip_control_chars(gr.fetchable_url), + ) return # ROBOTS @@ -365,120 +431,188 @@ def crawl_page(gemini_resource, current_depth, should_check_if_expired=True, red crawl_delay = robots_file.crawl_delay("gus") if not can_fetch: - logging.debug('Blocked by robots files, skipping: %s', gus.lib.logging.strip_control_chars(url)) + logging.debug( + "Blocked by robots files, skipping: %s", + gus.lib.logging.strip_control_chars(url), + ) return # Crawl delay if gr.normalized_host in domain_hit_timings: if gr.normalized_host in CRAWL_DELAYS: - next_allowed_hit = domain_hit_timings[gr.normalized_host] + timedelta(milliseconds=CRAWL_DELAYS[gr.normalized_host]) + next_allowed_hit = domain_hit_timings[gr.normalized_host] + timedelta( + milliseconds=CRAWL_DELAYS[gr.normalized_host] + ) elif not crawl_delay: - next_allowed_hit = domain_hit_timings[gr.normalized_host] + timedelta(milliseconds=500) + next_allowed_hit = domain_hit_timings[gr.normalized_host] + timedelta( + milliseconds=500 + ) else: - next_allowed_hit = domain_hit_timings[gr.normalized_host] + timedelta(milliseconds=crawl_delay) + next_allowed_hit = domain_hit_timings[gr.normalized_host] + timedelta( + milliseconds=crawl_delay + ) sleep_duration = max((next_allowed_hit - datetime.now()).total_seconds(), 0) time.sleep(sleep_duration) domain_hit_timings[gr.normalized_host] = datetime.now() # Actually fetch! - logging.info('Fetching resource: %s', gus.lib.logging.strip_control_chars(url)) + logging.info("Fetching resource: %s", gus.lib.logging.strip_control_chars(url)) if gr.fully_qualified_parent_url is not None: - logging.debug('with parent: %s', gus.lib.logging.strip_control_chars(gr.fully_qualified_parent_url)) + logging.debug( + "with parent: %s", + gus.lib.logging.strip_control_chars(gr.fully_qualified_parent_url), + ) response = gr.fetch() if response is None: # problem before getting a response - logging.warn('Failed to fetch: %s', gus.lib.logging.strip_control_chars(url)) + logging.warn("Failed to fetch: %s", gus.lib.logging.strip_control_chars(url)) page = index_error(gr, True) - page_crawl = Crawl(page=page, - status=0, - is_different=False, - timestamp=datetime.utcnow()) + page_crawl = Crawl( + page=page, status=0, is_different=False, timestamp=datetime.utcnow() + ) page_crawl.save() elif response.status.startswith("4"): # temporary error status - logging.debug('Got temporary error: %s: %s %s', - gus.lib.logging.strip_control_chars(url), - response.status, - response.error_message) + logging.debug( + "Got temporary error: %s: %s %s", + gus.lib.logging.strip_control_chars(url), + response.status, + response.error_message, + ) page = index_error(gr, True) - page_crawl = Crawl(page=page, - status=response.status, - is_different=False, - error_message=response.error_message, - timestamp=datetime.utcnow()) + page_crawl = Crawl( + page=page, + status=response.status, + is_different=False, + error_message=response.error_message, + timestamp=datetime.utcnow(), + ) page_crawl.save() elif response.status.startswith("5"): # permanent error status - logging.debug('Got permanent error: %s: %s %s', - gus.lib.logging.strip_control_chars(url), - response.status, - response.error_message) + logging.debug( + "Got permanent error: %s: %s %s", + gus.lib.logging.strip_control_chars(url), + response.status, + response.error_message, + ) page = index_error(gr, False) - page_crawl = Crawl(page=page, - status=response.status, - is_different=False, - error_message=response.error_message, - timestamp=datetime.utcnow()) + page_crawl = Crawl( + page=page, + status=response.status, + is_different=False, + error_message=response.error_message, + timestamp=datetime.utcnow(), + ) page_crawl.save() elif response.status.startswith("3"): # redirect status - logging.debug('Got redirected: %s: %s %s', - gus.lib.logging.strip_control_chars(url), - response.status, - response.url) + logging.debug( + "Got redirected: %s: %s %s", + gus.lib.logging.strip_control_chars(url), + response.status, + response.url, + ) if len(redirect_chain) > constants.MAXIMUM_REDIRECT_CHAIN_LENGTH: - logging.info('Aborting, maximum redirect chain length reached: %s', gus.lib.logging.strip_control_chars(url)) + logging.info( + "Aborting, maximum redirect chain length reached: %s", + gus.lib.logging.strip_control_chars(url), + ) return - redirect_resource = GeminiResource(response.url, gr.normalized_url, gr.normalized_host) + redirect_resource = GeminiResource( + response.url, gr.normalized_url, gr.normalized_host + ) if redirect_resource.fetchable_url == gr.fetchable_url: - logging.info('Aborting, redirecting to self: %s', gus.lib.logging.strip_control_chars(url)) + logging.info( + "Aborting, redirecting to self: %s", + gus.lib.logging.strip_control_chars(url), + ) return page = index_redirect(gr) - page_crawl = Crawl(page=page, status=response.status, is_different=False, timestamp=datetime.utcnow()) + page_crawl = Crawl( + page=page, + status=response.status, + is_different=False, + timestamp=datetime.utcnow(), + ) page_crawl.save() index_links(gr, [redirect_resource]) - crawl_page(redirect_resource, current_depth, should_check_if_expired=True, redirect_chain=redirect_chain + [gr.fetchable_url]) + crawl_page( + redirect_resource, + current_depth, + should_check_if_expired=True, + redirect_chain=redirect_chain + [gr.fetchable_url], + ) elif response.status.startswith("1"): # input status - logging.debug('Input requested at: %s: %s %s', gus.lib.logging.strip_control_chars(url), response.status, response.prompt) + logging.debug( + "Input requested at: %s: %s %s", + gus.lib.logging.strip_control_chars(url), + response.status, + response.prompt, + ) page = index_prompt(gr, response) - page_crawl = Crawl(page=page, status=response.status, is_different=False, timestamp=datetime.utcnow()) + page_crawl = Crawl( + page=page, + status=response.status, + is_different=False, + timestamp=datetime.utcnow(), + ) page_crawl.save() elif response.status.startswith("2"): # success status - logging.debug('Successful request: %s: %s %s', gus.lib.logging.strip_control_chars(url), response.status, response.content_type) + logging.debug( + "Successful request: %s: %s %s", + gus.lib.logging.strip_control_chars(url), + response.status, + response.content_type, + ) if response.content_type.startswith("text/"): page, is_different = index_content(gr, response) page_crawl = Crawl( page=page, status=response.status, is_different=is_different, - timestamp=datetime.utcnow() + timestamp=datetime.utcnow(), ) page_crawl.save() if response.content_type != "text/gemini": - logging.debug('Content is not gemini text: %s: %s', - gus.lib.logging.strip_control_chars(url), response.content_type) + logging.debug( + "Content is not gemini text: %s: %s", + gus.lib.logging.strip_control_chars(url), + response.content_type, + ) else: - logging.debug('Got gemini text, extracting and crawling links: %s', - gus.lib.logging.strip_control_chars(url)) + logging.debug( + "Got gemini text, extracting and crawling links: %s", + gus.lib.logging.strip_control_chars(url), + ) contained_resources = gr.extract_contained_resources(response.content) index_links(gr, contained_resources) for resource in contained_resources: - crawl_page(resource, current_depth+1, should_check_if_expired=True) + crawl_page( + resource, current_depth + 1, should_check_if_expired=True + ) else: page = index_binary(gr, response) - page_crawl = Crawl(page=page, status=response.status, is_different=False, timestamp=datetime.utcnow()) + page_crawl = Crawl( + page=page, + status=response.status, + is_different=False, + timestamp=datetime.utcnow(), + ) page_crawl.save() else: - logging.warn('Got unhandled status: %s: %s', - gus.lib.logging.strip_control_chars(url), - response.status) + logging.warn( + "Got unhandled status: %s: %s", + gus.lib.logging.strip_control_chars(url), + response.status, + ) def pickle_robot_file_map(robot_file_map, index_dir): @@ -487,13 +621,14 @@ def pickle_robot_file_map(robot_file_map, index_dir): def unpickle_robot_file_map(index_dir): if not os.path.isfile(index_dir + "/robot_file_map.p"): - logging.debug('Robot file cache missing') + logging.debug("Robot file cache missing") return {} return pickle.load(open(index_dir + "/robot_file_map.p", "rb")) def load_expired_urls(): - expired_pages = Page.raw("""SELECT url + expired_pages = Page.raw( + """SELECT url FROM ( SELECT p.url, p.normalized_url, p.change_frequency, MAX(c.timestamp) as timestamp FROM page as p @@ -502,7 +637,8 @@ FROM ( GROUP BY p.url ) WHERE datetime(timestamp, REPLACE('fnord hours', 'fnord', change_frequency)) < datetime('now') -GROUP BY normalized_url;""") +GROUP BY normalized_url;""" + ) return [page.url for page in expired_pages.execute()] @@ -528,7 +664,10 @@ def load_feed_urls(filename): def items_from_feed_string(feed_str): feed_obj = feedparser.parse(feed_str) feed = feed_obj.feed - return [(entry.updated_parsed, entry.link, entry.title, feed.title) for entry in feed_obj.entries] + return [ + (entry.updated_parsed, entry.link, entry.title, feed.title) + for entry in feed_obj.entries + ] def resolve_feed_content_urls(feed_file=constants.FEED_FILE): @@ -550,26 +689,29 @@ def resolve_feed_content_urls(feed_file=constants.FEED_FILE): now = time.time() interval = int(now - last) if interval < 5: - logging.warn('Declining to hit %s again after only %d seconds', - gus.lib.logging.strip_control_chars(feed_resource.normalized_host), - interval) + logging.warn( + "Declining to hit %s again after only %d seconds", + gus.lib.logging.strip_control_chars(feed_resource.normalized_host), + interval, + ) feed_urls.insert(0, feed_url) skips += 1 if skips == len(feed_urls): # We've hammered every server in the queue! Sleep a bit... - logging.warn('Sleeping to give all servers a rest!') + logging.warn("Sleeping to give all servers a rest!") time.sleep(5) continue skips = 0 # Good to go - logging.info('Fetching feed: %s', - gus.lib.logging.strip_control_chars(feed_url)) + logging.info("Fetching feed: %s", gus.lib.logging.strip_control_chars(feed_url)) try: resp = feed_resource.fetch() except: - logging.info('Error fetching feed, skipping: %s', - gus.lib.logging.strip_control_chars(feed_url)) + logging.info( + "Error fetching feed, skipping: %s", + gus.lib.logging.strip_control_chars(feed_url), + ) continue if resp and resp.status == "20": last_accessed[feed_resource.normalized_host] = time.time() @@ -595,8 +737,10 @@ def recrawl_feeds(): crawl_page(resource, 0) pickle_robot_file_map(robot_file_map, index_dir) - logging.debug('Recrawled feeds: %s', gus.lib.logging.strip_control_chars(content_urls)) - logging.info('Finished!') + logging.debug( + "Recrawled feeds: %s", gus.lib.logging.strip_control_chars(content_urls) + ) + logging.info("Finished!") def run_crawl(should_run_destructive=False, seed_urls=[]): @@ -609,7 +753,9 @@ def run_crawl(should_run_destructive=False, seed_urls=[]): db = init_db(f"{index_dir}/{constants.DB_FILENAME}") global robot_file_map - robot_file_map = {} if should_run_destructive else unpickle_robot_file_map(INDEX_DIR_CURRENT) + robot_file_map = ( + {} if should_run_destructive else unpickle_robot_file_map(INDEX_DIR_CURRENT) + ) global domain_hit_timings domain_hit_timings = {} global max_crawl_depth @@ -623,7 +769,7 @@ def run_crawl(should_run_destructive=False, seed_urls=[]): crawl_page(resource, 0, should_check_if_expired=True) pickle_robot_file_map(robot_file_map, index_dir) - logging.info('Finished!') + logging.info("Finished!") def main(): @@ -637,7 +783,7 @@ def main(): def parse_args(): - parser = argparse.ArgumentParser(description='Crawl Geminispace.') + parser = argparse.ArgumentParser(description="Crawl Geminispace.") parser.add_argument( "--destructive", "-d", diff --git a/gus/lib/db_model.py b/gus/lib/db_model.py @@ -12,6 +12,7 @@ from peewee import ( from gus.lib.gemini import GeminiResource + def init_db(filename=":memory:"): """ Bind an SQLite database to the Peewee ORM models. @@ -20,13 +21,15 @@ def init_db(filename=":memory:"): db = SqliteDatabase(filename) db.bind(models) db.create_tables(models) - db.execute_sql("""CREATE VIEW IF NOT EXISTS indexable_crawl AS + db.execute_sql( + """CREATE VIEW IF NOT EXISTS indexable_crawl AS SELECT c.* FROM ( SELECT crawl.*, row_number() OVER (PARTITION BY page_id ORDER BY timestamp DESC) AS rank FROM crawl) AS c WHERE c.rank < 3 -AND c.status == 20;""") +AND c.status == 20;""" + ) return db @@ -46,33 +49,36 @@ class Page(Model): lang = TextField(null=True) content = TextField(null=True) prompt = TextField(null=True) - size = IntegerField(null=True) # in bytes - change_frequency = IntegerField(null=True) # in hours + size = IntegerField(null=True) # in bytes + change_frequency = IntegerField(null=True) # in hours indexed_at = DateTimeField(null=True) + class Link(Model): """ Hyperlinks between pages in Geminispace """ - from_page = ForeignKeyField(Page, backref="outbound_links", on_delete='CASCADE') - to_page = ForeignKeyField(Page, backref="backlinks", on_delete='CASCADE') + from_page = ForeignKeyField(Page, backref="outbound_links", on_delete="CASCADE") + to_page = ForeignKeyField(Page, backref="backlinks", on_delete="CASCADE") is_cross_host_like = BooleanField() def get_is_cross_host_like(from_resource, to_resource): return from_resource.normalized_host_like != to_resource.normalized_host_like + class Crawl(Model): """ Attempts to crawl a page. """ - page = ForeignKeyField(Page, backref="crawls", on_delete='CASCADE') + page = ForeignKeyField(Page, backref="crawls", on_delete="CASCADE") status = IntegerField() error_message = TextField(null=True) is_different = BooleanField() timestamp = DateTimeField() + class Search(Model): """ A log of performed searches @@ -81,19 +87,22 @@ class Search(Model): query = TextField() timestamp = DateTimeField() + class Thread(Model): """ Thread definitions. """ + updated_at = DateTimeField() + class ThreadPage(Model): """ Mapping table of threads to their member pages. """ thread = ForeignKeyField(Thread, backref="pages", on_delete="CASCADE") - page = ForeignKeyField(Page, backref="threads", on_delete='CASCADE') + page = ForeignKeyField(Page, backref="threads", on_delete="CASCADE") address = TextField() friendly_author = TextField() friendly_title = TextField() diff --git a/gus/lib/domain.py b/gus/lib/domain.py @@ -1,5 +1,6 @@ import re + def is_domain(possible_domain): domain_pattern = "^((?=[a-z0-9-]{1,63}\.)(xn--)?[a-z0-9]+(-[a-z0-9]+)*\.)+(aaa|aarp|abarth|abb|abbott|abbvie|abc|able|abogado|abudhabi|ac|academy|accenture|accountant|accountants|aco|active|ac|or|ad|adac|ads|adult|ae|aeg|aero|aetna|af|afamilycompany|afl|africa|ag|agakhan|agency|ai|aig|aigo|airbus|airforce|airtel|akdn|al|alfaromeo|alibaba|alipay|allfinanz|allstate|ally|alsace|alstom|am|americanexpress|americanfamily|amex|amfam|amica|amsterdam|an|analytics|android|anquan|anz|ao|aol|apartments|app|apple|aq|aquarelle|ar|arab|aramco|archi|army|arpa|art|arte|as|asda|asia|associates|at|athleta|attorney|au|auction|audi|audible|audio|auspost|author|auto|autos|avianca|aw|aws|ax|axa|az|azure|ba|baby|baidu|banamex|bananarepublic|band|bank|bar|barcelona|barclaycard|barclays|barefoot|bargains|baseball|basketball|bauhaus|bayern|bb|bbc|bbt|bbva|bcg|bcn|bd|be|beats|beauty|beer|bentley|berlin|best|bestbuy|bet|bf|bg|bh|bharti|bi|bible|bid|bike|bing|bingo|bio|biz|bj|bl|black|blackfriday|blanco|blockbuster|blog|bloomberg|blue|bm|bms|bmw|bn|bnl|bnpparibas|bo|boats|boehringer|bofa|bom|bond|boo|book|booking|boots|bosch|bostik|boston|bot|boutique|box|bq|br|bradesco|bridgestone|broadway|broker|brother|brussels|bs|bt|budapest|bugatti|build|builders|business|buy|buzz|bv|bw|by|bz|bzh|ca|cab|cafe|cal|call|calvinklein|cam|camera|camp|cancerresearch|canon|capetown|capital|capitalone|car|caravan|cards|care|career|careers|cars|cartier|casa|case|caseih|cash|casino|cat|catering|catholic|cba|cbn|cbre|cbs|cc|cd|ceb|center|ceo|cern|cf|cfa|cfd|cg|ch|chanel|channel|charity|chase|chat|cheap|chintai|chloe|christmas|chrome|chrysler|church|ci|cipriani|circle|cisco|citadel|citi|citic|city|cityeats|ck|cl|claims|cleaning|click|clinic|clinique|clothing|cloud|club|clubmed|cm|cn|co|coach|codes|coffee|college|cologne|com|comcast|commbank|community|company|compare|computer|comsec|condos|construction|consulting|contact|contractors|cooking|cookingchannel|cool|coop|corsica|country|coupon|coupons|courses|cr|credit|creditcard|creditunion|cricket|crown|crs|cruise|cruises|csc|cu|cuisinella|cv|cw|cx|cy|cymru|cyou|cz|dabur|dad|dance|data|date|dating|datsun|day|dclk|dds|de|deal|dealer|deals|degree|delivery|dell|deloitte|delta|democrat|dental|dentist|desi|design|dev|dhl|diamonds|diet|digital|direct|directory|discount|discover|dish|diy|dj|dk|dm|dnp|do|docs|doctor|dodge|dog|doha|domains|doosan|dot|download|drive|dtv|dubai|duck|dunlop|duns|dupont|durban|dvag|dvr|dz|earth|eat|ec|eco|edeka|edu|education|ee|eg|eh|email|emerck|energy|engineer|engineering|enterprises|epost|epson|equipment|er|ericsson|erni|es|esq|estate|esurance|et|etisalat|eu|eurovision|eus|events|everbank|exchange|expert|exposed|express|extraspace|fage|fail|fairwinds|faith|family|fan|fans|farm|farmers|fashion|fast|fedex|feedback|ferrari|ferrero|fi|fiat|fidelity|fido|film|final|finance|financial|fire|firestone|firmdale|fish|fishing|fit|fitness|fj|fk|flickr|flights|flir|florist|flowers|flsmidth|fly|fm|fo|foo|food|foodnetwork|football|ford|forex|forsale|forum|foundation|fox|fr|free|fresenius|frl|frogans|frontdoor|frontier|ftr|fujitsu|fujixerox|fun|fund|furniture|futbol|fyi|ga|gal|gallery|gallo|gallup|game|games|gap|garden|gb|gbiz|gd|gdn|ge|gea|gent|genting|george|gf|gg|ggee|gh|gi|gift|gifts|gives|giving|gl|glade|glass|gle|global|globo|gm|gmail|gmbh|gmo|gmx|gn|godaddy|gold|goldpoint|golf|goo|goodhands|goodyear|goog|google|gop|got|gov|gp|gq|gr|grainger|graphics|gratis|green|gripe|grocery|group|gs|gt|gu|guardian|gucci|guge|guide|guitars|guru|gw|gy|hair|hamburg|hangout|haus|hbo|hdfc|hdfcbank|health|healthcare|help|helsinki|here|hermes|hgtv|hiphop|hisamitsu|hitachi|hiv|hk|hkt|hm|hn|hockey|holdings|holiday|homedepot|homegoods|homes|homesense|honda|honeywell|horse|hospital|host|hosting|hot|hoteles|hotels|hotmail|house|how|hr|hsbc|ht|htc|hu|hughes|hyatt|hyundai|ibm|icbc|ice|icu|id|ie|ieee|ifm|iinet|ikano|il|im|imamat|imdb|immo|immobilien|in|industries|infiniti|info|ing|ink|institute|insurance|insure|int|intel|international|intuit|investments|io|ipiranga|iq|ir|irish|is|iselect|ismaili|ist|istanbul|it|itau|itv|iveco|iwc|jaguar|java|jcb|jcp|je|jeep|jetzt|jewelry|jio|jlc|jll|jm|jmp|jnj|jo|jobs|joburg|jot|joy|jp|jpmorgan|jprs|juegos|juniper|kaufen|kddi|ke|kerryhotels|kerrylogistics|kerryproperties|kfh|kg|kh|ki|kia|kim|kinder|kindle|kitchen|kiwi|km|kn|koeln|komatsu|kosher|kp|kpmg|kpn|kr|krd|kred|kuokgroup|kw|ky|kyoto|kz|la|lacaixa|ladbrokes|lamborghini|lamer|lancaster|lancia|lancome|land|landrover|lanxess|lasalle|lat|latino|latrobe|law|lawyer|lb|lc|lds|lease|leclerc|lefrak|legal|lego|lexus|lgbt|li|liaison|lidl|life|lifeinsurance|lifestyle|lighting|like|lilly|limited|limo|lincoln|linde|link|lipsy|live|living|lixil|lk|llc|loan|loans|locker|locus|loft|lol|london|lotte|lotto|love|lpl|lplfinancial|lr|ls|lt|ltd|ltda|lu|lundbeck|lupin|luxe|luxury|lv|ly|ma|macys|madrid|maif|maison|makeup|man|management|mango|map|market|marketing|markets|marriott|marshalls|maserati|mattel|mba|mc|mcd|mcdonalds|mckinsey|md|me|med|media|meet|melbourne|meme|memorial|men|menu|meo|merckmsd|metlife|mf|mg|mh|miami|microsoft|mil|mini|mint|mit|mitsubishi|mk|ml|mlb|mls|mm|mma|mn|mo|mobi|mobile|mobily|moda|moe|moi|mom|monash|money|monster|montblanc|mopar|mormon|mortgage|moscow|moto|motorcycles|mov|movie|movistar|mp|mq|mr|ms|msd|mt|mtn|mtpc|mtr|mu|museum|mutual|mutuelle|mv|mw|mx|my|mz|na|nab|nadex|nagoya|name|nationwide|natura|navy|nba|nc|ne|nec|net|netbank|netflix|network|neustar|new|newholland|news|next|nextdirect|nexus|nf|nfl|ng|ngo|nhk|ni|nico|nike|nikon|ninja|nissan|nissay|nl|no|nokia|northwesternmutual|norton|now|nowruz|nowtv|np|nr|nra|nrw|ntt|nu|nyc|nz|obi|observer|off|office|okinawa|olayan|olayangroup|oldnavy|ollo|om|omega|one|ong|onl|online|onyourside|ooo|open|oracle|orange|org|organic|orientexpress|origins|osaka|otsuka|ott|ovh|pa|page|pamperedchef|panasonic|panerai|paris|pars|partners|parts|party|passagens|pay|pccw|pe|pet|pf|pfizer|pg|ph|pharmacy|phd|philips|phone|photo|photography|photos|physio|piaget|pics|pictet|pictures|pid|pin|ping|pink|pioneer|pizza|pk|pl|place|play|playstation|plumbing|plus|pm|pn|pnc|pohl|poker|politie|porn|post|pr|pramerica|praxi|press|prime|pro|prod|productions|prof|progressive|promo|properties|property|protection|pru|prudential|ps|pt|pub|pw|pwc|py|qa|qpon|quebec|quest|qvc|racing|radio|raid|re|read|realestate|realtor|realty|recipes|red|redstone|redumbrella|rehab|reise|reisen|reit|reliance|ren|rent|rentals|repair|report|republican|rest|restaurant|review|reviews|rexroth|rich|richardli|ricoh|rightathome|ril|rio|rip|rmit|ro|rocher|rocks|rodeo|rogers|room|rs|rsvp|ru|rugby|ruhr|run|rw|rwe|ryukyu|sa|saarland|safe|safety|sakura|sale|salon|samsclub|samsung|sandvik|sandvikcoromant|sanofi|sap|sapo|sarl|sas|save|saxo|sb|sbi|sbs|sc|sca|scb|schaeffler|schmidt|scholarships|school|schule|schwarz|science|scjohnson|scor|scot|sd|se|search|seat|secure|security|seek|select|sener|services|ses|seven|sew|sex|sexy|sfr|sg|sh|shangrila|sharp|shaw|shell|shia|shiksha|shoes|shop|shopping|shouji|show|showtime|shriram|si|silk|sina|singles|site|sj|sk|ski|skin|sky|skype|sl|sling|sm|smart|smile|sn|sncf|so|soccer|social|softbank|software|sohu|solar|solutions|song|sony|soy|space|spiegel|sport|spot|spreadbetting|sr|srl|srt|ss|st|stada|staples|star|starhub|statebank|statefarm|statoil|stc|stcgroup|stockholm|storage|store|stream|studio|study|style|su|sucks|supplies|supply|support|surf|surgery|suzuki|sv|swatch|swiftcover|swiss|sx|sy|sydney|symantec|systems|sz|tab|taipei|talk|taobao|target|tatamotors|tatar|tattoo|tax|taxi|tc|tci|td|tdk|team|tech|technology|tel|telecity|telefonica|temasek|tennis|teva|tf|tg|th|thd|theater|theatre|tiaa|tickets|tienda|tiffany|tips|tires|tirol|tj|tjmaxx|tjx|tk|tkmaxx|tl|tm|tmall|tn|to|today|tokyo|tools|top|toray|toshiba|total|tours|town|toyota|toys|tp|tr|trade|trading|training|travel|travelchannel|travelers|travelersinsurance|trust|trv|tt|tube|tui|tunes|tushu|tv|tvs|tw|tz|ua|ubank|ubs|uconnect|ug|uk|um|unicom|university|uno|uol|ups|us|uy|uz|va|vacations|vana|vanguard|vc|ve|vegas|ventures|verisign|versicherung|vet|vg|vi|viajes|video|vig|viking|villas|vin|vip|virgin|visa|vision|vista|vistaprint|viva|vivo|vlaanderen|vn|vodka|volkswagen|volvo|vote|voting|voto|voyage|vu|vuelos|wales|walmart|walter|wang|wanggou|warman|watch|watches|weather|weatherchannel|webcam|weber|website|wed|wedding|weibo|weir|wf|whoswho|wien|wiki|williamhill|win|windows|wine|winners|wme|wolterskluwer|woodside|work|works|world|wow|ws|wtc|wtf|xbox|xerox|xfinity|xihuan|xin|测试|कॉम|परीक्षा|セール|佛山|ಭಾರತ|慈善|集团|在线|한국|ଭାରତ|大众汽车|点看|คอม|ভাৰত|ভারত|八卦|‏موقع‎|বাংলা|公益|公司|香格里拉|网站|移动|我爱你|москва|испытание|қаз|католик|онлайн|сайт|联通|срб|бг|бел|‏קום‎|时尚|微博|테스트|淡马锡|ファッション|орг|नेट|ストア|삼성|சிங்கப்பூர்|商标|商店|商城|дети|мкд|‏טעסט‎|ею|ポイント|新闻|工行|家電|‏كوم‎|中文网|中信|中国|中國|娱乐|谷歌|భారత్|ලංකා|電訊盈科|购物|測試|クラウド|ભારત|通販|भारतम्|भारत|भारोत|‏آزمایشی‎|பரிட்சை|网店|संगठन|餐厅|网络|ком|укр|香港|诺基亚|食品|δοκιμή|飞利浦|‏إختبار‎|台湾|台灣|手表|手机|мон|‏الجزائر‎|‏عمان‎|‏ارامكو‎|‏ایران‎|‏العليان‎|‏اتصالات‎|‏امارات‎|‏بازار‎|‏موريتانيا‎|‏پاکستان‎|‏الاردن‎|‏موبايلي‎|‏بارت‎|‏بھارت‎|‏المغرب‎|‏ابوظبي‎|‏السعودية‎|‏ڀارت‎|‏كاثوليك‎|‏سودان‎|‏همراه‎|‏عراق‎|‏مليسيا‎|澳門|닷컴|政府|‏شبكة‎|‏بيتك‎|‏عرب‎|გე|机构|组织机构|健康|ไทย|‏سورية‎|招聘|рус|рф|珠宝|‏تونس‎|大拿|みんな|グーグル|ελ|世界|書籍|ഭാരതം|ਭਾਰਤ|网址|닷넷|コム|天主教|游戏|vermögensberater|vermögensberatung|企业|信息|嘉里大酒店|嘉里|‏مصر‎|‏قطر‎|广东|இலங்கை|இந்தியா|հայ|新加坡|‏فلسطين‎|テスト|政务|xperia|xxx|xyz|yachts|yahoo|yamaxun|yandex|ye|yodobashi|yoga|yokohama|you|youtube|yt|yun|za|zappos|zara|zero|zip|zippo|zm|zone|zuerich|zw)$" domain_match = re.match(domain_pattern, possible_domain, re.I) diff --git a/gus/lib/gemini.py b/gus/lib/gemini.py @@ -1,5 +1,12 @@ import re -from urllib.parse import unquote, urljoin, urlsplit, urlunsplit, uses_relative, uses_netloc +from urllib.parse import ( + unquote, + urljoin, + urlsplit, + urlunsplit, + uses_relative, + uses_netloc, +) from urllib.robotparser import RobotFileParser import gusmobile @@ -12,22 +19,47 @@ from gus.lib.domain import is_domain uses_relative.append("gemini") uses_netloc.append("gemini") -LOG_ROOT_LIKE_PATTERN = re.compile(".*/(gemlog|glog|journal|twinlog|posts|post|tangents|phlog|starlog|pikkulog|blog|log)/?$", flags=re.IGNORECASE) -LOG_POST_LIKE_PATTERN = re.compile(".*/((gemlog|glog|journal|twinlog|posts|post|tangents|phlog|starlog|pikkulog|blog|log)/.+$|.*/\d{4}[-_]\d{2}[-_]\d{2}.*$|.*/(19|20)\d{6}.*$)", flags=re.IGNORECASE) -LOG_POST_LIKE_EXCLUSION_PATTERN = re.compile(".*/(games|archive|archives|rss|handlers|diagnostics)/.*|.*atom.xml$|.*gemlog.gmi$|.*index.gmi$|.*index.gemini$", flags=re.IGNORECASE) -LOG_POST_GEMLOGBLUE_LIKE_PATTERN = re.compile("^/users/[a-z][-a-z0-9]*/\d+\.gmi?", flags=re.IGNORECASE) -LOG_POST_BOSTON_LIKE_PATTERN = re.compile("^/boston/\d{4}/\d{2}/\d+\.\d+", flags=re.IGNORECASE) - -ROOT_LIKE_ONLY_PATTERN = re.compile("^/(~[a-z][-a-z0-9]*|users/[a-z][-a-z0-9]*|users)/?$", flags=re.IGNORECASE) -ROOT_LIKE_PATTERN = re.compile("^/(~[a-z][-a-z0-9]*|users/[a-z][-a-z0-9]*|users)/?", flags=re.IGNORECASE) +LOG_ROOT_LIKE_PATTERN = re.compile( + ".*/(gemlog|glog|journal|twinlog|posts|post|tangents|phlog|starlog|pikkulog|blog|log)/?$", + flags=re.IGNORECASE, +) +LOG_POST_LIKE_PATTERN = re.compile( + ".*/((gemlog|glog|journal|twinlog|posts|post|tangents|phlog|starlog|pikkulog|blog|log)/.+$|.*/\d{4}[-_]\d{2}[-_]\d{2}.*$|.*/(19|20)\d{6}.*$)", + flags=re.IGNORECASE, +) +LOG_POST_LIKE_EXCLUSION_PATTERN = re.compile( + ".*/(games|archive|archives|rss|handlers|diagnostics)/.*|.*atom.xml$|.*gemlog.gmi$|.*index.gmi$|.*index.gemini$", + flags=re.IGNORECASE, +) +LOG_POST_GEMLOGBLUE_LIKE_PATTERN = re.compile( + "^/users/[a-z][-a-z0-9]*/\d+\.gmi?", flags=re.IGNORECASE +) +LOG_POST_BOSTON_LIKE_PATTERN = re.compile( + "^/boston/\d{4}/\d{2}/\d+\.\d+", flags=re.IGNORECASE +) + +ROOT_LIKE_ONLY_PATTERN = re.compile( + "^/(~[a-z][-a-z0-9]*|users/[a-z][-a-z0-9]*|users)/?$", flags=re.IGNORECASE +) +ROOT_LIKE_PATTERN = re.compile( + "^/(~[a-z][-a-z0-9]*|users/[a-z][-a-z0-9]*|users)/?", flags=re.IGNORECASE +) PIKKULOG_LIKE_PATTERN = re.compile(".*/pikkulog/.*", flags=re.IGNORECASE) -AUTHOR_URL_PATTERN = re.compile("^/~([a-z][-a-z0-9]*)/|^/users/~?([a-z][-a-z0-9]*)", flags=re.IGNORECASE) -AUTHOR_CONTENT_PATTERN = re.compile(".*(by|author): ([\w\s\d]+)", flags=re.IGNORECASE | re.MULTILINE) +AUTHOR_URL_PATTERN = re.compile( + "^/~([a-z][-a-z0-9]*)/|^/users/~?([a-z][-a-z0-9]*)", flags=re.IGNORECASE +) +AUTHOR_CONTENT_PATTERN = re.compile( + ".*(by|author): ([\w\s\d]+)", flags=re.IGNORECASE | re.MULTILINE +) TITLE_CONTENT_PATTERN = re.compile("^#\s(.*)$", flags=re.IGNORECASE | re.MULTILINE) -TITLE_URL_PATTERN = re.compile(".*/(\d{8}[-_]|\d{4}[-_]\d{2}[-_]\d{2}[-_])?([a-z0-9-_]+)(\.[a-z0-9]+)$", flags=re.IGNORECASE) +TITLE_URL_PATTERN = re.compile( + ".*/(\d{8}[-_]|\d{4}[-_]\d{2}[-_]\d{2}[-_])?([a-z0-9-_]+)(\.[a-z0-9]+)$", + flags=re.IGNORECASE, +) + class GeminiRobotFileParser(RobotFileParser): def set_url(self, url): @@ -36,7 +68,6 @@ class GeminiRobotFileParser(RobotFileParser): u, _ = GeminiResource.urlsplit_featureful(url) self.host, self.path = u[1:3] - def read(self): """Reads the robots.txt URL and feeds it to the parser.""" gr = GeminiResource(self.url) @@ -50,7 +81,7 @@ class GeminiRobotFileParser(RobotFileParser): self.parse(response.content.splitlines()) -class GeminiResource(): +class GeminiResource: def __init__(self, url, fully_qualified_parent_url=None, parent_hostname=None): self.raw_url = url self.urlsplit, self.is_relative = GeminiResource.urlsplit_featureful( @@ -80,7 +111,7 @@ class GeminiResource(): # things behind the scenes. is_relative = False - u = urlsplit(url, 'gemini') + u = urlsplit(url, "gemini") if u.scheme != "gemini": return None, None if u.hostname is None: @@ -89,9 +120,9 @@ class GeminiResource(): if parent_hostname is None: return None, None joined = urljoin("gemini://{}".format(parent_hostname), url) - u = urlsplit(joined, 'gemini') + u = urlsplit(joined, "gemini") is_relative = True - else: # url does not start with / + else: # url does not start with / # could be: blah.com/test # could be: test url_split = url.split("/") @@ -99,33 +130,36 @@ class GeminiResource(): # prepend with "gemini://" so built-in urlsplit will extract # the host properly, and continue on url = "gemini://{}".format(url) - u = urlsplit(url, 'gemini') + u = urlsplit(url, "gemini") else: # process relative link if fully_qualified_parent_url is None: return None, None joined = urljoin(fully_qualified_parent_url, url) - u = urlsplit(joined, 'gemini') + u = urlsplit(joined, "gemini") is_relative = True return u, is_relative - def _get_normalized_url(self): if not self.is_valid: return None if self._normalized_url is None: - self._normalized_url, self._normalized_host = self._get_normalized_url_and_host() + ( + self._normalized_url, + self._normalized_host, + ) = self._get_normalized_url_and_host() return self._normalized_url - def _get_normalized_host(self): if not self.is_valid: return None if self._normalized_host is None: - self._normalized_url, self._normalized_host = self._get_normalized_url_and_host() + ( + self._normalized_url, + self._normalized_host, + ) = self._get_normalized_url_and_host() return self._normalized_host - def _get_normalized_host_like(self): if not self.is_valid: return None @@ -137,7 +171,6 @@ class GeminiResource(): self._normalized_host_like = normalized_host_like return self._normalized_host_like - def _get_fetchable_url(self): if not self.is_valid: return None @@ -162,27 +195,32 @@ class GeminiResource(): self._fetchable_url = url return self._fetchable_url - def _get_indexable_url(self): if not self.is_valid: return None if self._indexable_url is None: indexable_url = unquote(self.fetchable_url) if self.urlsplit.port == 1965: - indexable_url = self.normalized_url.replace(self.urlsplit.hostname.lower() + ":1965", self.urlsplit.hostname.lower(), 1) + indexable_url = self.normalized_url.replace( + self.urlsplit.hostname.lower() + ":1965", + self.urlsplit.hostname.lower(), + 1, + ) self._indexable_url = indexable_url return self._indexable_url - def _get_is_root_like(self): if self._is_root_like is None: is_root_like = False - if self.urlsplit.path == "" or self.urlsplit.path == "/" or ROOT_LIKE_ONLY_PATTERN.match(self.urlsplit.path): + if ( + self.urlsplit.path == "" + or self.urlsplit.path == "/" + or ROOT_LIKE_ONLY_PATTERN.match(self.urlsplit.path) + ): is_root_like = True self._is_root_like = is_root_like return self._is_root_like - def _get_is_pikkulog_like(self): if self._is_pikkulog_like is None: is_pikkulog_like = False @@ -192,30 +230,39 @@ class GeminiResource(): self._is_pikkulog_like = is_pikkulog_like return self._is_pikkulog_like - def _get_is_log_root_like(self): if self._is_log_root_like is None: is_log_root_like = False - if self.urlsplit.path == "" or self.urlsplit.path == "/" or LOG_ROOT_LIKE_PATTERN.match(self.urlsplit.path): + if ( + self.urlsplit.path == "" + or self.urlsplit.path == "/" + or LOG_ROOT_LIKE_PATTERN.match(self.urlsplit.path) + ): is_log_root_like = True self._is_log_root_like = is_log_root_like return self._is_log_root_like - def _get_is_log_post_like(self): if self._is_log_post_like is None: is_log_post_like = False post_like_match = LOG_POST_LIKE_PATTERN.match(self.urlsplit.path) - post_like_exclusion_match = LOG_POST_LIKE_EXCLUSION_PATTERN.match(self.urlsplit.path) - post_gemlogblue_match = LOG_POST_GEMLOGBLUE_LIKE_PATTERN.match(self.urlsplit.path) + post_like_exclusion_match = LOG_POST_LIKE_EXCLUSION_PATTERN.match( + self.urlsplit.path + ) + post_gemlogblue_match = LOG_POST_GEMLOGBLUE_LIKE_PATTERN.match( + self.urlsplit.path + ) post_boston_match = LOG_POST_BOSTON_LIKE_PATTERN.match(self.urlsplit.path) - if (post_like_match and not post_like_exclusion_match) or (self.normalized_host == "gemlog.blue" and post_gemlogblue_match) or (self.normalized_host == "gemini.conman.org" and post_boston_match): + if ( + (post_like_match and not post_like_exclusion_match) + or (self.normalized_host == "gemlog.blue" and post_gemlogblue_match) + or (self.normalized_host == "gemini.conman.org" and post_boston_match) + ): is_log_post_like = True self._is_log_post_like = is_log_post_like return self._is_log_post_like - def get_friendly_author(self, content): if not self.is_valid: return None @@ -238,7 +285,6 @@ class GeminiResource(): friendly_author = self.normalized_host return friendly_author - def get_friendly_title(self, content): if not self.is_valid: return None @@ -253,13 +299,18 @@ class GeminiResource(): # if no content match, try looking in URL title_url_match = TITLE_URL_PATTERN.match(self.urlsplit.path) if title_url_match: - friendly_title = title_url_match[2].replace("-", " ").replace("_", " ").strip().title() + friendly_title = ( + title_url_match[2] + .replace("-", " ") + .replace("_", " ") + .strip() + .title() + ) if friendly_title is None: # if still no match, use URL path friendly_title = self.urlsplit.path.lstrip("/") return friendly_title - def get_default_change_frequency(self, category): if not self.is_valid: return None @@ -287,7 +338,6 @@ class GeminiResource(): self._default_change_frequency = change_frequency return self._default_change_frequency - def increment_change_frequency(self, existing_change_frequency, category): if category == "content": if self.is_root_like or self.is_log_root_like: @@ -309,7 +359,6 @@ class GeminiResource(): else: raise Exception.NameError("Unrecognized resource category") - # constructed from fetchable_url # does not matter if quoted or unquoted so I choose arbitrarily to # standardize on unquoting it. @@ -333,15 +382,17 @@ class GeminiResource(): # and a server redirecting to the same URL _with_ a trailing slash. return gusmobile.fetch(self.fetchable_url) - def _get_normalized_url_and_host(self): url_normalized = unquote(self.fetchable_url.lower().rstrip("/")) if self.urlsplit.port == 1965: - url_normalized = url_normalized.replace(self.urlsplit.hostname.lower() + ":1965", self.urlsplit.hostname.lower(), 1) + url_normalized = url_normalized.replace( + self.urlsplit.hostname.lower() + ":1965", + self.urlsplit.hostname.lower(), + 1, + ) host_normalized = self.urlsplit.hostname.lower() return url_normalized, host_normalized - def extract_contained_resources(self, content): # this finds all gemini URLs within the content of a given GeminiResource and # returns them as a list of new GeminiResources @@ -349,9 +400,13 @@ class GeminiResource(): return self.contained_resources link_pattern = "^=>\s*(\S+)" - preformat_pattern = r'^```.*?^```' - content_without_preformat = re.sub(preformat_pattern, '', content, flags=re.DOTALL | re.MULTILINE) - probable_urls = re.findall(link_pattern, content_without_preformat, re.MULTILINE) + preformat_pattern = r"^```.*?^```" + content_without_preformat = re.sub( + preformat_pattern, "", content, flags=re.DOTALL | re.MULTILINE + ) + probable_urls = re.findall( + link_pattern, content_without_preformat, re.MULTILINE + ) resources = [] for url in probable_urls: resource = GeminiResource( diff --git a/gus/lib/logging.py b/gus/lib/logging.py @@ -7,11 +7,11 @@ def add_arguments(parser): """Add arguments to the given argument argparse parser.""" parser.add_argument( - '--logging-config', - '-c', - dest='logging_ini_fname', + "--logging-config", + "-c", + dest="logging_ini_fname", default=False, - help='Location of logging configuration file' + help="Location of logging configuration file", ) @@ -22,11 +22,10 @@ def handle_arguments(args): if os.path.isfile(args.logging_ini_fname): logging.config.fileConfig(args.logging_ini_fname) else: - sys.exit('Can not find logging ini file: %s' % - args.logging_ini_fname) + sys.exit("Can not find logging ini file: %s" % args.logging_ini_fname) - elif os.path.isfile('logging.ini'): - logging.config.fileConfig('logging.ini') + elif os.path.isfile("logging.ini"): + logging.config.fileConfig("logging.ini") def strip_control_chars(s): diff --git a/gus/lib/misc.py b/gus/lib/misc.py @@ -8,15 +8,24 @@ License: MIT """ SYMBOLS = { - 'customary' : ('B', 'K', 'M', 'G', 'T', 'P', 'E', 'Z', 'Y'), - 'customary_ext' : ('byte', 'kilo', 'mega', 'giga', 'tera', 'peta', 'exa', - 'zetta', 'iotta'), - 'iec' : ('Bi', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi', 'Yi'), - 'iec_ext' : ('byte', 'kibi', 'mebi', 'gibi', 'tebi', 'pebi', 'exbi', - 'zebi', 'yobi'), + "customary": ("B", "K", "M", "G", "T", "P", "E", "Z", "Y"), + "customary_ext": ( + "byte", + "kilo", + "mega", + "giga", + "tera", + "peta", + "exa", + "zetta", + "iotta", + ), + "iec": ("Bi", "Ki", "Mi", "Gi", "Ti", "Pi", "Ei", "Zi", "Yi"), + "iec_ext": ("byte", "kibi", "mebi", "gibi", "tebi", "pebi", "exbi", "zebi", "yobi"), } -def bytes2human(n, format='%(value).1f %(symbol)s', symbols='customary'): + +def bytes2human(n, format="%(value).1f %(symbol)s", symbols="customary"): """ Convert n bytes into a human readable string based on format. symbols can be either "customary", "customary_ext", "iec" or "iec_ext", @@ -59,7 +68,7 @@ def bytes2human(n, format='%(value).1f %(symbol)s', symbols='customary'): symbols = SYMBOLS[symbols] prefix = {} for i, s in enumerate(symbols[1:]): - prefix[s] = 1 << (i+1)*10 + prefix[s] = 1 << (i + 1) * 10 for symbol in reversed(symbols[1:]): if n >= prefix[symbol]: value = float(n) / prefix[symbol] diff --git a/gus/lib/whoosh_extensions.py b/gus/lib/whoosh_extensions.py @@ -16,7 +16,12 @@ def UrlAnalyzer(): """ - return RegexTokenizer(expression=":1965|^gemini://|[/\.\?]", gaps=True) | IntraWordFilter() | LowercaseFilter() | StemFilter() + return ( + RegexTokenizer(expression=":1965|^gemini://|[/\.\?]", gaps=True) + | IntraWordFilter() + | LowercaseFilter() + | StemFilter() + ) class GeminiFormatter(highlight.Formatter): @@ -35,7 +40,6 @@ class GeminiFormatter(highlight.Formatter): # string return "%s" % tokentext - def format_fragment(self, fragment, replace=False): """Returns a formatted version of the given text, using the "token" objects in the given :class:`Fragment`. @@ -57,21 +61,22 @@ class GeminiFormatter(highlight.Formatter): if t.startchar < index: continue if t.startchar > index: - output.append(self._text(text[index:t.startchar])) + output.append(self._text(text[index : t.startchar])) output.append(self.format_token(text, t, replace)) index = t.endchar - output.append(self._text(text[index:fragment.endchar])) + output.append(self._text(text[index : fragment.endchar])) output.append("...") out_string = "".join(output) - out_string = out_string.replace("\n", " ").replace('\r', ' ') - out_string = ' '.join(out_string.split()) + out_string = out_string.replace("\n", " ").replace("\r", " ") + out_string = " ".join(out_string.split()) return out_string special_char_pattern = re.compile("[^\w\s,\.;-\?\!']") link_pattern = re.compile("://|=>") + class GeminiScorer(highlight.FragmentScorer): def __call__(self, f): # Add up the boosts for the matched terms in this passage @@ -87,10 +92,12 @@ class GeminiScorer(highlight.FragmentScorer): # ascii art, as well as source code (which, I suppose will make snippets # lower quality for actual searches for source code, but that is a very # small minority of searches in the current state of things). - num_special_chars = len(special_char_pattern.findall(f.text[f.startchar:f.endchar])) + num_special_chars = len( + special_char_pattern.findall(f.text[f.startchar : f.endchar]) + ) score -= 4 * num_special_chars + math.pow(num_special_chars, 1.5) - num_links = len(link_pattern.findall(f.text[f.startchar:f.endchar])) + num_links = len(link_pattern.findall(f.text[f.startchar : f.endchar])) score -= 30 * num_links return max(0, score) diff --git a/serve/constants.py b/serve/constants.py @@ -44,9 +44,5 @@ QUOTE_BANK = [ "quote": "The truth will set you free. But not until it is finished with you.", "author": "David Foster Wallace", }, - { - "quote": "Jazz isn't dead. It just smells funny.", - "author": "Frank Zappa", - }, - + {"quote": "Jazz isn't dead. It just smells funny.", "author": "Frank Zappa",}, ] diff --git a/serve/main.py b/serve/main.py @@ -4,25 +4,13 @@ import jetforce from . import app, gus + def parse_args(): parser = argparse.ArgumentParser() - parser.add_argument( - "--host", - help="Server address to bind to", - default="127.0.0.1" - ) - parser.add_argument( - "--port", - help="Server port to bind to", - type=int, - default=1965 - ) - parser.add_argument( - "--hostname", - help="Server hostname", - default="localhost" - ) + parser.add_argument("--host", help="Server address to bind to", default="127.0.0.1") + parser.add_argument("--port", help="Server port to bind to", type=int, default=1965) + parser.add_argument("--hostname", help="Server hostname", default="localhost") parser.add_argument( "--tls-certfile", dest="certfile", diff --git a/serve/models.py b/serve/models.py @@ -9,11 +9,15 @@ from whoosh.index import open_dir from . import constants from gus.lib.db_model import init_db, Crawl, Link, Page, Search, Thread from gus.lib.gemini import GeminiResource -from gus.lib.index_statistics import compute_index_statistics, load_all_statistics_from_file +from gus.lib.index_statistics import ( + compute_index_statistics, + load_all_statistics_from_file, +) from gus.lib.misc import bytes2human from gus.lib.whoosh_extensions import GeminiFormatter, GeminiScorer -class GUS(): + +class GUS: def __init__(self): self.ix = open_dir(constants.INDEX_DIR) self.searcher = self.ix.searcher() @@ -27,12 +31,15 @@ class GUS(): self.db = init_db(f"{constants.INDEX_DIR}/{constants.DB_FILENAME}") self.statistics = compute_index_statistics(self.db) - self.statistics_historical_overall = load_all_statistics_from_file(constants.STATISTICS_FILE) - + self.statistics_historical_overall = load_all_statistics_from_file( + constants.STATISTICS_FILE + ) def init_query_parser(ix): or_group = qparser.OrGroup.factory(0.99) - query_parser = qparser.MultifieldParser(["content", "url", "prompt"], ix.schema, group=or_group) + query_parser = qparser.MultifieldParser( + ["content", "url", "prompt"], ix.schema, group=or_group + ) query_parser.add_plugin(qparser.RegexPlugin()) query_parser.add_plugin(qparser.GtLtPlugin()) query_parser.remove_plugin_class(qparser.WildcardPlugin) @@ -40,36 +47,44 @@ class GUS(): query_parser.remove_plugin_class(qparser.RangePlugin) return query_parser - def search_index(self, query, requested_page): Search.create(query=query, timestamp=datetime.utcnow()) query = self.query_parser.parse(query) results = self.searcher.search_page(query, requested_page, pagelen=10) return ( len(results), - [{ - "score" : result.score, - "indexed_at" : result["indexed_at"], - "url" : result["url"], - "fetchable_url" : result["fetchable_url"], - "content_type" : result["content_type"], - "charset" : result["charset"] if "charset" in result else "none", - "size" : result["size"] if "size" in result else 0, - "prompt" : result["prompt"] if "prompt" in result else "", - "highlights" : self.gemini_highlighter.highlight_hit(result, "content", top=1) if "content" in result and result["content_type"] in ["text/plain", "text/gemini", "text/markdown"] else "", - "link_text" : GUS._get_link_text(result), - "backlink_count": result["backlink_count"], - } for result in results], + [ + { + "score": result.score, + "indexed_at": result["indexed_at"], + "url": result["url"], + "fetchable_url": result["fetchable_url"], + "content_type": result["content_type"], + "charset": result["charset"] if "charset" in result else "none", + "size": result["size"] if "size" in result else 0, + "prompt": result["prompt"] if "prompt" in result else "", + "highlights": self.gemini_highlighter.highlight_hit( + result, "content", top=1 + ) + if "content" in result + and result["content_type"] + in ["text/plain", "text/gemini", "text/markdown"] + else "", + "link_text": GUS._get_link_text(result), + "backlink_count": result["backlink_count"], + } + for result in results + ], ) - def get_backlinks(self, url): resource = GeminiResource(url) if not resource.is_valid: return [], [] u = resource.indexable_url.rstrip("/") - backlinks_query = Page.raw("""SELECT p_from.url, l.is_cross_host_like + backlinks_query = Page.raw( + """SELECT p_from.url, l.is_cross_host_like FROM page AS p_from JOIN indexable_crawl AS ic ON ic.page_id == p_from.id @@ -80,18 +95,22 @@ ON p_to.id == l.to_page_id WHERE p_to.url IN (?, ?) AND p_from.normalized_url != ? GROUP BY p_from.normalized_url -ORDER BY l.is_cross_host_like, p_from.url ASC""", u, f"{u}/", resource.normalized_url) +ORDER BY l.is_cross_host_like, p_from.url ASC""", + u, + f"{u}/", + resource.normalized_url, + ) backlinks = backlinks_query.execute() internal_backlink_urls = [b.url for b in backlinks if not b.is_cross_host_like] external_backlink_urls = [b.url for b in backlinks if b.is_cross_host_like] return internal_backlink_urls, external_backlink_urls - def get_threads(self, sort="recency"): sort = sort.lower() if sort == "recency": - threads_query = Thread.raw("""SELECT t.* + threads_query = Thread.raw( + """SELECT t.* , tp.address , tp.friendly_author , tp.friendly_title @@ -111,9 +130,11 @@ JOIN crawl AS c ON c.page_id == p.id WHERE c.status == 20 GROUP BY tp.id -ORDER BY t.updated_at DESC, t.id ASC, tp.address ASC""") +ORDER BY t.updated_at DESC, t.id ASC, tp.address ASC""" + ) elif sort == "length": - threads_query = Thread.raw("""SELECT t.* + threads_query = Thread.raw( + """SELECT t.* , tp.address , tp.friendly_author , tp.friendly_title @@ -136,7 +157,8 @@ JOIN crawl AS c ON c.page_id == p.id WHERE c.status == 20 GROUP BY tp.id -ORDER BY t.thread_length DESC, t.updated_at DESC, t.id ASC, tp.address ASC""") +ORDER BY t.thread_length DESC, t.updated_at DESC, t.id ASC, tp.address ASC""" + ) else: threads_query = "" threads = [] @@ -144,44 +166,52 @@ ORDER BY t.thread_length DESC, t.updated_at DESC, t.id ASC, tp.address ASC""") last_id = None for thread_member in threads_query.iterator(): if thread_member.updated_at.date() != last_date: - threads.append({ - "threads": [], - "date": thread_member.updated_at, - }) + threads.append( + {"threads": [], "date": thread_member.updated_at,} + ) last_date = thread_member.updated_at.date() if thread_member.id != last_id: - threads[-1]["threads"].append({ - "members": [], - "updated_at": thread_member.updated_at, - }) + threads[-1]["threads"].append( + {"members": [], "updated_at": thread_member.updated_at,} + ) last_id = thread_member.id - threads[-1]["threads"][-1]["members"].append({ - "url": thread_member.url, - "fetchable_url": thread_member.fetchable_url, - "address": thread_member.address, - "friendly_author": thread_member.friendly_author, - "friendly_title": thread_member.friendly_title, - "first_seen": datetime.strptime(thread_member.first_seen, "%Y-%m-%d %H:%M:%S.%f"), - }) + threads[-1]["threads"][-1]["members"].append( + { + "url": thread_member.url, + "fetchable_url": thread_member.fetchable_url, + "address": thread_member.address, + "friendly_author": thread_member.friendly_author, + "friendly_title": thread_member.friendly_title, + "first_seen": datetime.strptime( + thread_member.first_seen, "%Y-%m-%d %H:%M:%S.%f" + ), + } + ) # return sorted(threads, key=lambda x: (x["updated_at"], ), reverse=True) return threads - def _get_link_text(result): if result["content_type"] == "input": prompt_suffix = ": {}".format(result["prompt"]) - link_text = "{} ({}{})".format(result["url"][9:], result["content_type"], prompt_suffix) + link_text = "{} ({}{})".format( + result["url"][9:], result["content_type"], prompt_suffix + ) else: - lang_str = ", {}".format(result["lang"]) if "lang" in result and result["lang"] != "none" else "" + lang_str = ( + ", {}".format(result["lang"]) + if "lang" in result and result["lang"] != "none" + else "" + ) link_text = "{} ({}, {})".format( - result["url"][9:], result["content_type"], - bytes2human(result["size"], format="%(value).0f%(symbol)s") + result["url"][9:], + result["content_type"], + bytes2human(result["size"], format="%(value).0f%(symbol)s"), ) return link_text - def get_feeds(self): - feeds_query = Page.raw("""SELECT DISTINCT p.* + feeds_query = Page.raw( + """SELECT DISTINCT p.* FROM page AS p JOIN indexable_crawl AS c ON c.page_id == p.id @@ -190,12 +220,13 @@ OR p.url LIKE '%feed.xml' OR p.url LIKE '%.rss' OR p.url LIKE '%.atom' OR p.content_type IN ('application/atom+xml', 'application/rss+xml') -""") +""" + ) return feeds_query.execute() - def get_newest_hosts(self): - newest_hosts_query = Page.raw("""SELECT p.domain, MIN(c.timestamp) AS first_seen + newest_hosts_query = Page.raw( + """SELECT p.domain, MIN(c.timestamp) AS first_seen FROM page as p JOIN indexable_crawl AS ic ON ic.page_id == p.id @@ -204,12 +235,13 @@ ON c.page_id == p.id GROUP BY p.domain ORDER BY first_seen DESC LIMIT 10 -""") +""" + ) return newest_hosts_query.execute() - def get_newest_pages(self): - newest_pages_query = Page.raw("""SELECT p.url, p.fetchable_url, MIN(c.timestamp) AS first_seen + newest_pages_query = Page.raw( + """SELECT p.url, p.fetchable_url, MIN(c.timestamp) AS first_seen FROM page as p JOIN indexable_crawl AS ic ON ic.page_id == p.id @@ -218,19 +250,19 @@ ON c.page_id == p.id GROUP BY p.url ORDER BY first_seen DESC LIMIT 50 -""") +""" + ) return newest_pages_query.execute() - def get_search_suggestions(self, query): suggestions = [] corrector = self.searcher.corrector("content") for query_part in query.split(" "): query_part_suggestions = corrector.suggest(query_part, limit=3) - suggestions.extend({ - "raw": suggestion, - "quoted": quote(suggestion) - } for suggestion in query_part_suggestions) + suggestions.extend( + {"raw": suggestion, "quoted": quote(suggestion)} + for suggestion in query_part_suggestions + ) return suggestions diff --git a/serve/views.py b/serve/views.py @@ -8,7 +8,12 @@ import jinja2 from jetforce import Request, Response, Status, JetforceApplication from . import constants -from .models import compute_verbose, compute_requested_results_page, GUS, process_seed_request +from .models import ( + compute_verbose, + compute_requested_results_page, + GUS, + process_seed_request, +) TEMPLATE_DIR = os.path.join(os.path.dirname(__file__), "templates") @@ -19,6 +24,7 @@ template_env = jinja2.Environment( lstrip_blocks=True, ) + def datetimeformat(value, format="%Y-%m-%d"): return value.strftime(format) @@ -29,8 +35,10 @@ def threadaddressformat(value): return " " * (depth - 1) + "↳" return "" -template_env.filters['datetimeformat'] = datetimeformat -template_env.filters['threadaddressformat'] = threadaddressformat + +template_env.filters["datetimeformat"] = datetimeformat +template_env.filters["threadaddressformat"] = threadaddressformat + def render_template(name: str, *args, **kwargs) -> str: """ @@ -38,9 +46,11 @@ def render_template(name: str, *args, **kwargs) -> str: """ return template_env.get_template(name).render(*args, **kwargs) + app = JetforceApplication() gus = GUS() + @app.route("/favicon.txt", strict_trailing_slash=False) def favicon(request): return Response(Status.SUCCESS, "text/plain", "🔭") @@ -58,104 +68,128 @@ def add_seed(request): @app.route("/statistics", strict_trailing_slash=False) def statistics(request): - body = render_template("statistics.gmi", - statistics=gus.statistics, - index_modification_time=gus.statistics["index_modification_time"], - quote=random.choice(constants.QUOTE_BANK)) + body = render_template( + "statistics.gmi", + statistics=gus.statistics, + index_modification_time=gus.statistics["index_modification_time"], + quote=random.choice(constants.QUOTE_BANK), + ) return Response(Status.SUCCESS, "text/gemini", body) @app.route("/statistics/historical/overall", strict_trailing_slash=False) def statistics(request): - body = render_template("statistics_historical_overall.gmi", - statistics_historical_overall=gus.statistics_historical_overall, - index_modification_time=gus.statistics["index_modification_time"], - quote=random.choice(constants.QUOTE_BANK)) + body = render_template( + "statistics_historical_overall.gmi", + statistics_historical_overall=gus.statistics_historical_overall, + index_modification_time=gus.statistics["index_modification_time"], + quote=random.choice(constants.QUOTE_BANK), + ) return Response(Status.SUCCESS, "text/gemini", body) @app.route("/known-hosts", strict_trailing_slash=False) def known_hosts(request): - body = render_template("known_hosts.gmi", - # TODO: remove this `sorted` after the next index generation - known_hosts=sorted(gus.statistics["domains"]), - index_modification_time=gus.statistics["index_modification_time"], - quote=random.choice(constants.QUOTE_BANK)) + body = render_template( + "known_hosts.gmi", + # TODO: remove this `sorted` after the next index generation + known_hosts=sorted(gus.statistics["domains"]), + index_modification_time=gus.statistics["index_modification_time"], + quote=random.choice(constants.QUOTE_BANK), + ) return Response(Status.SUCCESS, "text/gemini", body) @app.route("/newest-hosts", strict_trailing_slash=False) def newest_hosts(request): - body = render_template("newest_hosts.gmi", - newest_hosts=gus.get_newest_hosts(), - index_modification_time=gus.statistics["index_modification_time"], - quote=random.choice(constants.QUOTE_BANK)) + body = render_template( + "newest_hosts.gmi", + newest_hosts=gus.get_newest_hosts(), + index_modification_time=gus.statistics["index_modification_time"], + quote=random.choice(constants.QUOTE_BANK), + ) return Response(Status.SUCCESS, "text/gemini", body) @app.route("/newest-pages", strict_trailing_slash=False) def newest_pages(request): - body = render_template("newest_pages.gmi", - newest_pages=gus.get_newest_pages(), - index_modification_time=gus.statistics["index_modification_time"], - quote=random.choice(constants.QUOTE_BANK)) + body = render_template( + "newest_pages.gmi", + newest_pages=gus.get_newest_pages(), + index_modification_time=gus.statistics["index_modification_time"], + quote=random.choice(constants.QUOTE_BANK), + ) return Response(Status.SUCCESS, "text/gemini", body) @app.route("/known-feeds", strict_trailing_slash=False) def known_feeds(request): - body = render_template("known_feeds.gmi", - known_feeds=gus.get_feeds(), - index_modification_time=gus.statistics["index_modification_time"], - quote=random.choice(constants.QUOTE_BANK)) + body = render_template( + "known_feeds.gmi", + known_feeds=gus.get_feeds(), + index_modification_time=gus.statistics["index_modification_time"], + quote=random.choice(constants.QUOTE_BANK), + ) return Response(Status.SUCCESS, "text/gemini", body) @app.route("", strict_trailing_slash=False) def index(request): - body = render_template("index.gmi", - index_modification_time=gus.statistics["index_modification_time"], - quote=random.choice(constants.QUOTE_BANK)) + body = render_template( + "index.gmi", + index_modification_time=gus.statistics["index_modification_time"], + quote=random.choice(constants.QUOTE_BANK), + ) return Response(Status.SUCCESS, "text/gemini", body) @app.route("/about", strict_trailing_slash=False) def index(request): - body = render_template("about.gmi", - index_modification_time=gus.statistics["index_modification_time"], - quote=random.choice(constants.QUOTE_BANK)) + body = render_template( + "about.gmi", + index_modification_time=gus.statistics["index_modification_time"], + quote=random.choice(constants.QUOTE_BANK), + ) return Response(Status.SUCCESS, "text/gemini", body) @app.route("/documentation/searching", strict_trailing_slash=False) def documentation_searching(request): - body = render_template("documentation/searching.gmi", - index_modification_time=gus.statistics["index_modification_time"], - quote=random.choice(constants.QUOTE_BANK)) + body = render_template( + "documentation/searching.gmi", + index_modification_time=gus.statistics["index_modification_time"], + quote=random.choice(constants.QUOTE_BANK), + ) return Response(Status.SUCCESS, "text/gemini", body) @app.route("/documentation/indexing", strict_trailing_slash=False) def documentation_indexing(request): - body = render_template("documentation/indexing.gmi", - index_modification_time=gus.statistics["index_modification_time"], - quote=random.choice(constants.QUOTE_BANK)) + body = render_template( + "documentation/indexing.gmi", + index_modification_time=gus.statistics["index_modification_time"], + quote=random.choice(constants.QUOTE_BANK), + ) return Response(Status.SUCCESS, "text/gemini", body) @app.route("/documentation/backlinks", strict_trailing_slash=False) def documentation_backlinks(request): - body = render_template("documentation/backlinks.gmi", - index_modification_time=gus.statistics["index_modification_time"], - quote=random.choice(constants.QUOTE_BANK)) + body = render_template( + "documentation/backlinks.gmi", + index_modification_time=gus.statistics["index_modification_time"], + quote=random.choice(constants.QUOTE_BANK), + ) return Response(Status.SUCCESS, "text/gemini", body) @app.route("/news", strict_trailing_slash=False) def index(request): - body = render_template("news.gmi", - index_modification_time=gus.statistics["index_modification_time"], - quote=random.choice(constants.QUOTE_BANK)) + body = render_template( + "news.gmi", + index_modification_time=gus.statistics["index_modification_time"], + quote=random.choice(constants.QUOTE_BANK), + ) return Response(Status.SUCCESS, "text/gemini", body) @@ -171,23 +205,27 @@ def search(request): current_page = min(requested_page, num_pages) if num_results == 0: current_page = 0 - body = render_template("search.gmi", - query=request.query, - quoted_query=quote(request.query), - verbose=verbose, - num_results=num_results, - results=results, - current_page=current_page, - num_pages=num_pages, - index_modification_time=gus.statistics["index_modification_time"], - quote=random.choice(constants.QUOTE_BANK)) + body = render_template( + "search.gmi", + query=request.query, + quoted_query=quote(request.query), + verbose=verbose, + num_results=num_results, + results=results, + current_page=current_page, + num_pages=num_pages, + index_modification_time=gus.statistics["index_modification_time"], + quote=random.choice(constants.QUOTE_BANK), + ) else: search_suggestions = gus.get_search_suggestions(request.query) - body = render_template("search_suggestions.gmi", - query=request.query, - search_suggestions=search_suggestions, - index_modification_time=gus.statistics["index_modification_time"], - quote=random.choice(constants.QUOTE_BANK)) + body = render_template( + "search_suggestions.gmi", + query=request.query, + search_suggestions=search_suggestions, + index_modification_time=gus.statistics["index_modification_time"], + quote=random.choice(constants.QUOTE_BANK), + ) return Response(Status.SUCCESS, "text/gemini", body) else: return Response(Status.INPUT, "Search query") @@ -213,12 +251,14 @@ def backlinks(request): if request.query: url = unquote(request.query) internal_backlinks, external_backlinks = gus.get_backlinks(url) - body = render_template("backlinks.gmi", - url=url, - internal_backlinks=internal_backlinks, - external_backlinks=external_backlinks, - index_modification_time=gus.statistics["index_modification_time"], - quote=random.choice(constants.QUOTE_BANK)) + body = render_template( + "backlinks.gmi", + url=url, + internal_backlinks=internal_backlinks, + external_backlinks=external_backlinks, + index_modification_time=gus.statistics["index_modification_time"], + quote=random.choice(constants.QUOTE_BANK), + ) return Response(Status.SUCCESS, "text/gemini", body) else: return Response(Status.INPUT, "Gemini URL") @@ -228,9 +268,11 @@ def backlinks(request): def threads(request): sort = request.query or "recency" threads = gus.get_threads(sort) - body = render_template("threads.gmi", - threads=threads, - sort=sort, - index_modification_time=gus.statistics["index_modification_time"], - quote=random.choice(constants.QUOTE_BANK)) + body = render_template( + "threads.gmi", + threads=threads, + sort=sort, + index_modification_time=gus.statistics["index_modification_time"], + quote=random.choice(constants.QUOTE_BANK), + ) return Response(Status.SUCCESS, "text/gemini", body) diff --git a/tests/gus/lib/test_gemini.py b/tests/gus/lib/test_gemini.py @@ -3,27 +3,31 @@ from gus.lib.gemini import GeminiResource class TestGeminiResource(unittest.TestCase): - def test_extract_contained_resources(self): - url = 'gemini://host' + url = "gemini://host" # no content - resources = GeminiResource(url).extract_contained_resources('') + resources = GeminiResource(url).extract_contained_resources("") self.assertEqual(resources, []) # not a link - resources = GeminiResource(url).extract_contained_resources(' => link') + resources = GeminiResource(url).extract_contained_resources(" => link") self.assertEqual(resources, []) - resources = GeminiResource(url).extract_contained_resources('```\n=> preformatted\n```') + resources = GeminiResource(url).extract_contained_resources( + "```\n=> preformatted\n```" + ) self.assertEqual(resources, []) # some links - resources = GeminiResource(url).extract_contained_resources('=> link\ntext\n=> other') + resources = GeminiResource(url).extract_contained_resources( + "=> link\ntext\n=> other" + ) self.assertEqual(len(resources), 2) - self.assertEqual(resources[0].raw_url, 'link') - self.assertEqual(resources[1].raw_url, 'other') + self.assertEqual(resources[0].raw_url, "link") + self.assertEqual(resources[1].raw_url, "other") - resources = GeminiResource(url).extract_contained_resources(""" + resources = GeminiResource(url).extract_contained_resources( + """ # title text => link @@ -32,7 +36,8 @@ text => no link ``` => other - """) + """ + ) self.assertEqual(len(resources), 2) - self.assertEqual(resources[0].raw_url, 'link') - self.assertEqual(resources[1].raw_url, 'other') + self.assertEqual(resources[0].raw_url, "link") + self.assertEqual(resources[1].raw_url, "other")