geminispace.info

Unnamed repository; edit this file 'description' to name the repository.
git clone git://code.clttr.info/geminispace.info.git
Log | Files | Refs | README | LICENSE

commit 8ce3f4d55921e668a413db6242496e8f3e30dab3
parent ffcba3395182761ec36834e566a2fac4c20df0c5
Author: Natalie Pendragon <natpen@natpen.net>
Date:   Tue, 12 May 2020 07:56:30 -0400

[statistics] Add ability to compute and print stats easily

Diffstat:
M gus/lib/index_statistics.py | 33 ++++++++++++++++++++++++++++-----
M pyproject.toml | 1 +
2 files changed, 29 insertions(+), 5 deletions(-)

diff --git a/gus/lib/index_statistics.py b/gus/lib/index_statistics.py @@ -10,14 +10,17 @@ from gus.lib.url_helpers import normalize_gemini_url def compute_index_statistics(index_dir): ix = open_dir(index_dir) + + # content types content_types = None with ix.reader() as reader: - # all_terms = reader.most_frequent_terms("content_type", number=999) - all_terms = reader.all_stored_fields() - content_types = set([term["content_type"] for term in all_terms]) - content_type_frequencies = [] - page_count = 0 + all_stored_fields = reader.all_stored_fields() + content_types = set([f["content_type"] for f in all_stored_fields]) + + # page count, domain count, content type frequencies page_count = 0 + domain_count = 0 + content_type_frequencies = [] with ix.searcher() as searcher: page_count = searcher.doc_count() @@ -36,6 +39,8 @@ def compute_index_statistics(index_dir): _, domain = normalize_gemini_url(result["url"]) domains.add(domain) domain_count = len(domains) + + # index modification time index_modification_time = datetime.fromtimestamp(os.path.getmtime(index_dir)) return { "index_modification_time": index_modification_time, @@ -46,6 +51,24 @@ def compute_index_statistics(index_dir): } +def print_index_statistics(index_statistics): + print("Index generation date : {:%Y-%m-%d}".format(index_statistics["index_modification_time"])) + print("Page Count : {:>10}".format(index_statistics["page_count"])) + print("Domain Count : {:>10}".format(index_statistics["domain_count"])) + print("\nContent Types:") + for pair in index_statistics["content_type_frequencies"]: + print("{:>5} - {}".format(pair[1], pair[0])) + print("\nDomains:") #.format(index_statistics["domains"])) + for domain in index_statistics["domains"]: + print("- {}".format(domain)) + + +def run_index_statistics(): + index_statistics = compute_index_statistics("index") + print_index_statistics(index_statistics) + # persist_index_statistics(index_statistics, "index-statistics.csv") + + def 
persist_index_statistics(index_statistics, filename): with open(filename, "a") as f: f.write(serialize_index_statistics_line(index_statistics)) diff --git a/pyproject.toml b/pyproject.toml @@ -24,3 +24,4 @@ build-backend = "poetry.masonry.api" crawl = "gus.crawl:main" search_index = "gus.search_index:main" serve = "gus.serve:main" +statistics = "gus.lib.index_statistics:run_index_statistics"