geminispace.info

Unnamed repository; edit this file 'description' to name the repository.
git clone git://code.clttr.info/geminispace.info.git
Log | Files | Refs | README | LICENSE

commit e4b2ef0192c7d75583f6a417c260585566b5125f
parent a3fef86b23fab26e13a97c281ef55f22cd444a19
Author: Natalie Pendragon <natpen@natpen.net>
Date:   Mon,  6 Jul 2020 06:22:01 -0400

Make incremental build_index work

Some of the idempotency was lost during the shuffle to split the crawl
into two phases.

Diffstat:
M gus/build_index.py          | 18 +++++++++++++++---
M gus/lib/index_statistics.py | 16 ++++++++--------
2 files changed, 23 insertions(+), 11 deletions(-)

diff --git a/gus/build_index.py b/gus/build_index.py @@ -108,7 +108,9 @@ def create_index(index_dir): index_storage.create_index(schema) -def index_page(page): +def index_page(page, indexed_urls): + if page.fetchable_url in indexed_urls: + return print(page.url) url = page.url.rstrip("/") from_page = Page.alias() @@ -138,6 +140,15 @@ def index_page(page): index_writer.add_document(**document) +def load_indexed_urls(index_dir): + indexed_urls = [] + ix = open_dir(index_dir) + with ix.reader() as reader: + all_stored_fields = reader.all_stored_fields() + # TODO: change this (back) to normalized url + # indexed_urls = [GeminiResource(f["url"]).normalized_url for f in all_stored_fields] + indexed_urls = [f["fetchable_url"] for f in all_stored_fields] + return indexed_urls def build_index(should_run_destructive=False): @@ -154,13 +165,14 @@ def build_index(should_run_destructive=False): ix = index_storage.open_index() global index_writer index_writer = ix.writer() + indexed_urls = [] if should_run_destructive else load_indexed_urls(INDEX_DIR_CURRENT) pages = Page.select().where(Page.indexed_at.is_null(False)) for page in pages.iterator(): - index_page(page) + index_page(page, indexed_urls) index_writer.commit() - index_statistics = compute_index_statistics(index_dir) + index_statistics = compute_index_statistics(db) print_index_statistics(index_statistics) persist_statistics(index_statistics, None, should_run_destructive, "statistics.csv") # if should_run_destructive: diff --git a/gus/lib/index_statistics.py b/gus/lib/index_statistics.py @@ -63,12 +63,12 @@ def print_index_statistics(index_statistics, crawl_statistics=None): # print("- {}".format(domain)) print("\nContent Types:") - for pair in index_statistics["content_type_frequencies"]: - print("{:>5} - {}".format(pair[1], pair[0])) + for entry in index_statistics["content_type_frequencies"]: + print("{:>5} - {}".format(entry["count"], entry["content_type"])) print("\nCharsets:") - for pair in 
index_statistics["charset_frequencies"]: - print("{:>5} - {}".format(pair[1], pair[0])) + for entry in index_statistics["charset_frequencies"]: + print("{:>5} - {}".format(entry["count"], entry["charset"])) def run_index_statistics(): @@ -92,8 +92,8 @@ def serialize_statistics_line(index_statistics, crawl_statistics, was_destructiv crawl_statistics["redirect_nontrivial_count"] if crawl_statistics else 0, crawl_statistics["broken_url_count"] if crawl_statistics else 0, "|".join(index_statistics["domains"]), - "|".join("{}:{}".format(pair[0], pair[1]) for pair in index_statistics["content_type_frequencies"]), - "|".join("{}:{}".format(pair[0], pair[1]) for pair in index_statistics["charset_frequencies"]), + "|".join("{}:{}".format(entry["content_type"], entry["count"]) for entry in index_statistics["content_type_frequencies"]), + "|".join("{}:{}".format(entry["charset"], entry["count"]) for entry in index_statistics["charset_frequencies"]), ) @@ -124,8 +124,8 @@ def deserialize_statistics_line(line): redirect_nontrivial_count = line_parts[5] broken_url_count = line_parts[6] domains = [domain for domain in line_parts[7].split("|")] - content_type_frequencies = [(pair.split(":")[0], pair.split(":")[1]) for pair in line_parts[8].split("|")] - charset_frequencies = [(pair.split(":")[0], pair.split(":")[1]) for pair in line_parts[9].split("|")] + content_type_frequencies = [(entry.split(":")[0], entry.split(":")[1]) for entry in line_parts[8].split("|")] + charset_frequencies = [(entry.split(":")[0], entry.split(":")[1]) for entry in line_parts[9].split("|")] return { "index_modification_time": index_modification_time,