geminispace.info

git clone git://code.clttr.info/geminispace.info.git

commit df3718e3a85f47e641649f37a523ba15db81e524
parent a9806b3f479e017f0555372d95d591ad33f42cf2
Author: Natalie Pendragon <natpen@natpen.net>
Date:   Sun, 19 Jul 2020 07:32:05 -0400

[crawl] Support per-page expiration

This will allow crawls to intelligently decide which URLs to recrawl,
if any. Some pages, like site indexes or gemlog pages, default to
expiring much more quickly than others, so recrawls should pick up
links to e.g. new posts fairly quickly. Conversely, existing posts
and binary files are considered more static, and will expire, and
thus be recrawled, much less frequently.
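
The mechanism added in gus/crawl.py below is that each page now carries a
change_frequency (in hours), and a page is only recrawled once that window
has elapsed since its most recent crawl (the should_check_if_expired check).
A minimal sketch of that idea in isolation; the is_expired helper here is
illustrative only and not part of the codebase:

from datetime import datetime, timedelta

def is_expired(most_recent_crawl, change_frequency_hours):
    # Pages with no recorded crawl are always due; otherwise a page is due
    # once change_frequency_hours have elapsed since its last crawl.
    if most_recent_crawl is None:
        return True
    return datetime.utcnow() >= most_recent_crawl + timedelta(hours=change_frequency_hours)

# A root page crawled 2 hours ago with the 3-hour default is not yet due:
print(is_expired(datetime.utcnow() - timedelta(hours=2), 3))      # False
# An article crawled 8 days ago with the 24 * 7 hour non-root default is due:
print(is_expired(datetime.utcnow() - timedelta(days=8), 24 * 7))  # True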

Diffstat:
M gus/constants.py | 7 +++++++
M gus/crawl.py | 192 +++++++++++++++++++++++++++++++++++--------------------------------------------
M gus/lib/db_model.py | 14 ++++++++++++--
M gus/lib/gemini.py | 26 ++++++++++++++++++++++++++
4 files changed, 130 insertions(+), 109 deletions(-)

diff --git a/gus/constants.py b/gus/constants.py
@@ -4,3 +4,10 @@ STATISTICS_FILE = "statistics.csv"
 DB_FILENAME = "gus.sqlite"
 FEED_FILE = "feeds.txt"
 MAXIMUM_REDIRECT_CHAIN_LENGTH = 5
+
+# default change frequencies (in hours)
+DEFAULT_ROOT_CHANGE_FREQUENCY = 3
+DEFAULT_NON_ROOT_CHANGE_FREQUENCY = 24 * 7
+DEFAULT_REDIRECT_CHANGE_FREQUENCY = 24 * 7
+DEFAULT_BINARY_CHANGE_FREQUENCY = 24 * 30
+DEFAULT_PROMPT_CHANGE_FREQUENCY = 24 * 30
diff --git a/gus/crawl.py b/gus/crawl.py
@@ -14,16 +14,18 @@ from peewee import (
     BooleanField,
     DateTimeField,
     DoesNotExist,
+    fn,
     FloatField,
     ForeignKeyField,
     IntegerField,
     Model,
+    RawQuery,
     SqliteDatabase,
     TextField,
 )
 
 from . import constants
-from gus.lib.db_model import init_db, Page, Link
+from gus.lib.db_model import init_db, Page, Link, Crawl
 from gus.lib.gemini import GeminiResource, GeminiRobotFileParser
 
 # hack(natpen): the built-in methods in urllib need to know the
@@ -35,54 +37,6 @@ INDEX_DIR_CURRENT = "index"
 INDEX_DIR_BACKUP = INDEX_DIR_CURRENT + ".bak"
 INDEX_DIR_NEW = INDEX_DIR_CURRENT + ".new"
 
-SEED_URLS = [
-    # English
-    "gemini://80h.dev",
-    "gemini://berserk.red",
-    "gemini://bleyble.com",
-    "gemini://breadpunk.club",
-    "gemini://carcosa.net",
-    "gemini://consensus.circumlunar.space",
-    "gemini://dgold.eu",
-    "gemini://earthlight.xyz",
-    "gemini://ecmelberk.com",
-    "gemini://envs.net",
-    "gemini://freedombone.net",
-    "gemini://gem.bestalbumsintheuniverse.com",
-    "gemini://gemini.68kmentat.com",
-    "gemini://gemini.circumlunar.space",
-    "gemini://gemini.conman.org",
-    "gemini://gemini.digiprime.xyz",
-    "gemini://gemini.kvn.dev",
-    "gemini://gemini.logfile.ch",
-    "gemini://gemini.mayvaneday.art",
-    "gemini://gemini.uxq.ch",
-    "gemini://gus.guru",
-    "gemini://heavysquare.com",
-    "gemini://houston.coder.town",
-    "gemini://jan.bio",
-    "gemini://konpeito.media",
-    "gemini://kwiecien.us",
-    "gemini://libraryoferis.org",
-    "gemini://makeworld.gq",
-    "gemini://mozz.us",
-    "gemini://park-city.club",
-    "gemini://pon.ix.tc",
-    "gemini://rawtext.club",
-    "gemini://samsai.eu",
-    "gemini://saintnet.tech",
-    "gemini://tilde.black",
-    "gemini://tilde.pink",
-    "gemini://typed-hole.org",
-    "gemini://vger.cloud",
-    "gemini://vi.rs",
-    "gemini://yam655.com",
-    "gemini://zaibatsu.circumlunar.space",
-
-    # Spanish
-    "gemini://gagarin.p4g.club",
-]
-
 # These are checked against normalized_url, so they should be
 # prepended with the gemini:// protocol, be all lowercased, and
 # not have the port specified if it is 1965.
@@ -166,11 +120,15 @@ EXCLUDED_URL_PREFIXES = [
     "gemini://chat.mozz.us/submit",
 
     # gopher proxy
-    "gemini://80h.dev/agena/"
+    "gemini://80h.dev/agena/",
+
+    # susa.net
+    "gemini://gemini.susa.net/cgi-bin/search?",
+    "gemini://gemini.susa.net/cgi-bin/twitter?",
+    "gemini://gemini.susa.net/cgi-bin/vim-search?",
 ]
 
 EXCLUDED_URL_PATHS = [
-    "atom.xml",
     "favicon.ico",
     "favicon.txt",
     "robots.txt",
@@ -194,13 +152,33 @@ def index_binary(resource, response):
         "content_type": response.content_type,
         "charset": response.charset,
         "size": response.num_bytes,
-        "indexed_at": datetime.utcnow(),
+        "change_frequency": constants.DEFAULT_BINARY_CHANGE_FREQUENCY,
+    }
+    existing_page = Page.get_or_none(url=resource.indexable_url)
+    if existing_page:
+        doc["id"] = existing_page.id
+        doc["change_frequency"] = existing_page.change_frequency or doc["change_frequency"]
+    page = Page(**doc)
+    page.save()
+    return page
+
+
+def index_redirect(resource):
+    print("INDEXING REDIRECT...")
+    doc = {
+        "url": resource.indexable_url,
+        "fetchable_url": resource.fetchable_url,
+        "domain": resource.normalized_host,
+        "port": resource.urlsplit.port or 1965,
+        "change_frequency": constants.DEFAULT_REDIRECT_CHANGE_FREQUENCY,
     }
     existing_page = Page.get_or_none(url=resource.indexable_url)
     if existing_page:
         doc["id"] = existing_page.id
+        doc["change_frequency"] = existing_page.change_frequency or doc["change_frequency"]
     page = Page(**doc)
     page.save()
+    return page
 
 
 def index_prompt(resource, response):
@@ -214,17 +192,20 @@ def index_prompt(resource, response):
         "charset": response.charset,
         "size": response.num_bytes,
         "prompt": response.prompt,
-        "indexed_at": datetime.utcnow(),
+        "change_frequency": constants.DEFAULT_PROMPT_CHANGE_FREQUENCY,
     }
     existing_page = Page.get_or_none(url=resource.indexable_url)
     if existing_page:
         doc["id"] = existing_page.id
+        doc["change_frequency"] = existing_page.change_frequency or doc["change_frequency"]
     page = Page(**doc)
     page.save()
+    return page
 
 
 def index_content(resource, response):
     print("INDEXING CONTENT...")
+    change_frequency = constants.DEFAULT_ROOT_CHANGE_FREQUENCY if resource.is_root_like or resource.is_log_like else constants.DEFAULT_NON_ROOT_CHANGE_FREQUENCY
     doc = {
         "url": resource.indexable_url,
         "fetchable_url": resource.fetchable_url,
@@ -234,15 +215,19 @@ def index_content(resource, response):
         "charset": response.charset,
         "content": response.content,
         "size": response.num_bytes,
-        "indexed_at": datetime.utcnow(),
+        "change_frequency": change_frequency,
     }
     if response.content_type == "text/gemini":
         doc["lang"] = response.lang or "none",
     existing_page = Page.get_or_none(url=resource.indexable_url)
+    is_different = False
     if existing_page:
         doc["id"] = existing_page.id
+        doc["change_frequency"] = existing_page.change_frequency or doc["change_frequency"]
+        is_different = existing_page.content is not None and doc["content"] != existing_page.content
    page = Page(**doc)
     page.save()
+    return page, is_different
 
 
 def index_links(from_resource, contained_resources):
@@ -264,15 +249,6 @@ def index_links(from_resource, contained_resources):
     Link.insert_many(data).execute()
 
 
-def rebuild_link_table():
-    Link.delete().execute()
-    pages = Page.select().where(Page.content_type == "text/gemini", Page.indexed_at.is_null(False))
-    for page in pages:
-        resource = GeminiResource(page.fetchable_url)
-        contained_resources = resource.extract_contained_resources(page.content)
-        index_links(resource, contained_resources)
-
-
 def get_robots_file(robot_host):
     if robot_host not in robot_file_map:
         print("Requesting robots.txt for {}".format(robot_host))
@@ -283,7 +259,7 @@ def get_robots_file(robot_host):
     return robot_file_map[robot_host]
 
 
-def crawl(gemini_resource, current_depth, redirect_chain=[]):
+def crawl_page(gemini_resource, current_depth, should_check_if_expired=True, redirect_chain=[]):
     gr = gemini_resource
     if max_crawl_depth >= 0 and current_depth > max_crawl_depth:
         print("DEPTH SKIP : %s" % gr.fetchable_url)
@@ -304,6 +280,17 @@ def crawl(gemini_resource, current_depth, redirect_chain=[]):
         print("--------------------------")
         return
 
+    if should_check_if_expired:
+        # expired_pages = Page.raw("SELECT p.* FROM page as p JOIN crawl as c ON p.id == c.page_id WHERE datetime(c.timestamp, REPLACE('fnord hours', 'fnord', p.change_frequency)) < datetime('now');")
+        # return [page.url for page in expired_pages.execute()]
+        existing_page = Page.get_or_none(url=gr.indexable_url)
+        if existing_page and existing_page.change_frequency is not None:
+            most_recent_crawl = Crawl.select(fn.MAX(Crawl.timestamp)).where(Crawl.page == existing_page).scalar()
+            if most_recent_crawl and datetime.now() < most_recent_crawl + timedelta(hours=existing_page.change_frequency):
+                print("TOO SOON : %s" % gr.fetchable_url)
+                print("--------------------------")
+                return
+
     # ROBOTS
     robots_file = get_robots_file(gr.normalized_host)
     crawl_delay = None
@@ -324,12 +311,6 @@ def crawl(gemini_resource, current_depth, redirect_chain=[]):
             print("ROBOTS SKIP : %s" % gr.fetchable_url)
             print("--------------------------")
             return
-    if gr.normalized_url in visited_urls:
-        print("ALREADY SEEN : %s" % gr.fetchable_url)
-        print("--------------------------")
-        return
-    else:
-        visited_urls.append(gr.normalized_url)
 
     # Crawl delay
     if gr.normalized_host in domain_hit_timings:
@@ -365,18 +346,20 @@ def crawl(gemini_resource, current_depth, redirect_chain=[]):
             print("REDIRECT TO SELF. ABORTING.")
            print("--------------------------")
            return
-        # NB: this pop is necessary because if the redirect is a change to the URL
-        # structure of, essentially, the same URL (e.g., like the addition or removal
-        # of a trailing slash), then the crawl of the redirect would think it had
-        # already seen this resource in visited_urls' normalized source of truth.
-        visited_urls.pop()
-        crawl(redirect_resource, current_depth, redirect_chain + [gr.fetchable_url])
+        page = index_redirect(gr)
+        page_crawl = Crawl(page=page, status=response.status, is_different=False, timestamp=datetime.utcnow())
+        page_crawl.save()
+        index_links(gr, [redirect_resource])
+        print("--------------------------")
+        crawl_page(redirect_resource, current_depth, should_check_if_expired=True, redirect_chain=redirect_chain + [gr.fetchable_url])
     elif response.status.startswith("1"):
         # input status
         print("URL : %s" % response.url)
         print("STATUS : %s" % response.status)
         print("PROMPT : %s" % response.prompt)
-        index_prompt(gr, response)
+        page = index_prompt(gr, response)
+        page_crawl = Crawl(page=page, status=response.status, is_different=False, timestamp=datetime.utcnow())
+        page_crawl.save()
         print("--------------------------")
     elif response.status.startswith("2"):
         # success status
@@ -384,17 +367,27 @@ def crawl(gemini_resource, current_depth, redirect_chain=[]):
         print("STATUS : %s" % response.status)
         print("CONTENT TYPE : %s" % response.content_type)
         if response.content_type.startswith("text/"):
-            index_content(gr, response)
+            page, is_different = index_content(gr, response)
+            page_crawl = Crawl(
+                page=page,
+                status=response.status,
+                is_different=is_different,
+                timestamp=datetime.utcnow()
+            )
+            page_crawl.save()
             if response.content_type != "text/gemini":
                 print("--------------------------")
             else:
                 print("Extracting contained resources...")
                 print("--------------------------")
                 contained_resources = gr.extract_contained_resources(response.content)
+                index_links(gr, contained_resources)
                 for resource in contained_resources:
-                    crawl(resource, current_depth+1)
+                    crawl_page(resource, current_depth+1, should_check_if_expired=True)
         else:
-            index_binary(gr, response)
+            page = index_binary(gr, response)
+            page_crawl = Crawl(page=page, status=response.status, is_different=False, timestamp=datetime.utcnow())
+            page_crawl.save()
             print("--------------------------")
     else:
         # input, error, etc (all other statuses)
@@ -402,15 +395,6 @@ def crawl(gemini_resource, current_depth, redirect_chain=[]):
         print("--------------------------")
 
 
-def is_nontrivial_redirect(url, redirect_url):
-    return url.rstrip() != redirect_url.rstrip()
-
-
-def load_visited_urls(index_dir):
-    visited_urls = [GeminiResource(p.url).normalized_url for p in Page.select()]
-    return visited_urls
-
-
 def pickle_robot_file_map(robot_file_map, index_dir):
     pickle.dump(robot_file_map, open(index_dir + "/robot_file_map.p", "wb"))
 
@@ -422,6 +406,11 @@ def unpickle_robot_file_map(index_dir):
     return pickle.load(open(index_dir + "/robot_file_map.p", "rb"))
 
 
+def load_expired_urls():
+    expired_pages = Page.raw("SELECT DISTINCT p.url FROM page as p JOIN crawl as c ON p.id == c.page_id WHERE datetime(c.timestamp, REPLACE('fnord hours', 'fnord', p.change_frequency)) < datetime('now');")
+    return [page.url for page in expired_pages.execute()]
+
+
 def load_seed_request_urls():
     with open("seed-requests.txt") as f:
         content = f.readlines()
@@ -497,8 +486,6 @@ def recrawl_feeds():
     db = init_db(f"{index_dir}/{constants.DB_FILENAME}")
     global max_crawl_depth
     max_crawl_depth = 0
-    global visited_urls
-    visited_urls = []
     global robot_file_map
     robot_file_map = unpickle_robot_file_map(INDEX_DIR_CURRENT)
     global domain_hit_timings
@@ -506,8 +493,7 @@ def recrawl_feeds():
 
     seed_resources = [GeminiResource(url) for url in content_urls]
     for resource in seed_resources:
-        crawl(resource, 0)
-    rebuild_link_table()
+        crawl_page(resource, 0)
 
     pickle_robot_file_map(robot_file_map, index_dir)
     print(content_urls)
@@ -523,8 +509,6 @@ def run_crawl(should_run_destructive=False, seed_urls=[]):
 
     global db
     db = init_db(f"{index_dir}/{constants.DB_FILENAME}")
-    global visited_urls
-    visited_urls = [] if should_run_destructive else load_visited_urls(INDEX_DIR_CURRENT)
     global robot_file_map
     robot_file_map = {} if should_run_destructive else unpickle_robot_file_map(INDEX_DIR_CURRENT)
     global domain_hit_timings
@@ -532,18 +516,12 @@ def run_crawl(should_run_destructive=False, seed_urls=[]):
     global max_crawl_depth
     max_crawl_depth = -1
 
-    seed_urls.extend(SEED_URLS)
-    seed_resources = [GeminiResource(url) for url in seed_urls]
-    for resource in seed_resources:
-        crawl(resource, 0)
-
-    # after full crawl, crawl the seed requests as well in case there is
-    # anything new
-    seed_request_urls = load_seed_request_urls()
-    seed_request_resources = [GeminiResource(url) for url in seed_request_urls]
-    for resource in seed_request_resources:
-        crawl(resource, 0)
-    rebuild_link_table()
+    expired_resources = [GeminiResource(url) for url in load_expired_urls()]
+    for resource in expired_resources:
+        crawl_page(resource, 0, should_check_if_expired=False)
+    submitted_resources = [GeminiResource(url) for url in load_seed_request_urls()]
+    for resource in submitted_resources:
+        crawl_page(resource, 0, should_check_if_expired=True)
 
     pickle_robot_file_map(robot_file_map, index_dir)
     print("Finished!")
diff --git a/gus/lib/db_model.py b/gus/lib/db_model.py
@@ -14,7 +14,7 @@ def init_db(filename=":memory:"):
     """
     Bind an SQLite database to the Peewee ORM models.
     """
-    models = [Page, Link]
+    models = [Page, Link, Crawl]
     db = SqliteDatabase(filename)
     db.bind(models)
     db.create_tables(models)
@@ -37,7 +37,7 @@ class Page(Model):
     content = TextField(null=True)
     prompt = TextField(null=True)
     size = IntegerField(null=True) # in bytes
-    indexed_at = DateTimeField(null=True)
+    change_frequency = IntegerField(null=True) # in hours
 
 class Link(Model):
     """
@@ -46,3 +46,13 @@ class Link(Model):
 
     from_page = ForeignKeyField(Page, backref="outbound_links")
     to_page = ForeignKeyField(Page, backref="backlinks")
+
+class Crawl(Model):
+    """
+    Attempts to crawl a page.
+    """
+
+    page = ForeignKeyField(Page, backref="crawls")
+    status = IntegerField()
+    is_different = BooleanField()
+    timestamp = DateTimeField()
diff --git a/gus/lib/gemini.py b/gus/lib/gemini.py
@@ -11,6 +11,9 @@ from gus.lib.domain import is_domain
 uses_relative.append("gemini")
 uses_netloc.append("gemini")
 
+LOG_LIKE_PATTERN = re.compile(".*/(gemlog|glog|starlog|pikkulog)/?$")
+ROOT_LIKE_PATTERN = re.compile("^/(~[a-z][-a-z0-9]*|users/[a-z][-a-z0-9]*|users)/?$")
+
 class GeminiRobotFileParser(RobotFileParser):
     def set_url(self, url):
         """Sets the URL referring to a robots.txt file."""
@@ -46,6 +49,8 @@ class GeminiResource():
         self._normalized_host = None
         self._fetchable_url = None
         self._indexable_url = None
+        self._is_root_like = None
+        self._is_log_like = None
         self.contained_resources = None
 
     def urlsplit_featureful(url, fully_qualified_parent_url=None, parent_hostname=None):
@@ -137,6 +142,25 @@ class GeminiResource():
             self._indexable_url = indexable_url
         return self._indexable_url
 
+
+    def _get_is_root_like(self):
+        if self._is_root_like is None:
+            is_root_like = False
+            if self.urlsplit.path == "" or self.urlsplit.path == "/" or ROOT_LIKE_PATTERN.match(self.urlsplit.path):
+                is_root_like = True
+            self._is_root_like = is_root_like
+        return self._is_root_like
+
+
+    def _get_is_log_like(self):
+        if self._is_log_like is None:
+            is_log_like = False
+            if self.urlsplit.path == "" or self.urlsplit.path == "/" or LOG_LIKE_PATTERN.match(self.urlsplit.path):
+                is_log_like = True
+            self._is_log_like = is_log_like
+        return self._is_log_like
+
+
     # constructed from fetchable_url
     # does not matter if quoted or unquoted so I choose arbitrarily to
     # standardize on unquoting it.
@@ -148,6 +172,8 @@ class GeminiResource():
     # constructed from fetchable_url
     # should be unquoted.
     indexable_url = property(_get_indexable_url)
+    is_root_like = property(_get_is_root_like)
+    is_log_like = property(_get_is_log_like)
 
     def fetch(self):
         # NB: this intentionally does NOT fetch the normalized URL, because that could
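
A note on the raw SQL in load_expired_urls() above: SQLite has no interval
type, so the query splices each page's change_frequency into the string
'fnord hours' with REPLACE() to build a per-row modifier like '168 hours',
which datetime() then applies to the crawl timestamp; any page whose shifted
timestamp is already in the past counts as expired. A standalone sketch of
just that expression (the timestamp literal and the 168-hour value are
illustrative):

import sqlite3

con = sqlite3.connect(":memory:")
# REPLACE('fnord hours', 'fnord', 168) yields the modifier string '168 hours',
# which datetime() adds to the stored crawl timestamp.
(shifted,) = con.execute(
    "SELECT datetime('2020-07-01 00:00:00', REPLACE('fnord hours', 'fnord', ?))",
    (168,),
).fetchone()
print(shifted)  # 2020-07-08 00:00:00 -- compare against datetime('now') for expiry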