geminispace.info

git clone git://code.clttr.info/geminispace.info.git

commit 0bbf43c49ba5cccbc26d346283d66a2651261a6d
parent ffd444663634ff94298ae07c6bbd943c8fdec880
Author: Natalie Pendragon <natpen@natpen.net>
Date:   Thu,  4 Jun 2020 11:27:51 -0400

[crawl] Improve indexing performance

Before this change, optimize calls were failing with index-out-of-bounds
errors, and a look inside the index/ dirs showed over thirty thousand
files. I think that volume of files was causing problems for Whoosh, so
now all writes are buffered in a single writer and committed at the end
of the crawl. It's more unfortunate now if the crawl dies partway
through, but c'est la vie. On the plus side, the optimize call is no
longer really necessary, since the final product is only a few index
segments.
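
For illustration, here is a minimal sketch (not code from this
repository) of the before/after pattern the message describes, assuming
Whoosh's standard writer API; the schema, the "index" directory, and the
example documents are hypothetical stand-ins for the crawler's real
objects:

# Sketch of the indexing change described above, using Whoosh directly.
import os
from datetime import datetime

from whoosh.fields import DATETIME, ID, TEXT, Schema
from whoosh.index import create_in


def index_per_document(ix, docs):
    # Old approach: open a writer and commit for every document. Every
    # commit flushes a new segment, so a long crawl leaves tens of
    # thousands of files in the index directory.
    for doc in docs:
        writer = ix.writer()
        try:
            writer.add_document(indexed_at=datetime.utcnow(), **doc)
            writer.commit()
        except Exception:
            writer.cancel()


def index_batched(ix, docs):
    # New approach: one writer for the whole crawl, committed once at
    # the end. The buffered documents are flushed as only a few
    # segments, so a separate optimize() pass adds little.
    writer = ix.writer()
    try:
        for doc in docs:
            writer.add_document(indexed_at=datetime.utcnow(), **doc)
        writer.commit()
    except Exception:
        writer.cancel()


if __name__ == "__main__":
    os.makedirs("index", exist_ok=True)
    schema = Schema(url=ID(stored=True, unique=True),
                    content=TEXT(stored=True),
                    indexed_at=DATETIME(stored=True))
    ix = create_in("index", schema)
    index_batched(ix, [{"url": "gemini://example.org/", "content": "hi"}])

The trade-off, as noted above, is that nothing is persisted until the
single final commit, so a crawl that dies partway through loses its
buffered writes.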

Diffstat:
M gus/crawl.py | 76 ++++++++++++++++++++++++++++++++---------------------------------------------
1 file changed, 31 insertions(+), 45 deletions(-)

diff --git a/gus/crawl.py b/gus/crawl.py
@@ -168,56 +168,42 @@ def create_index(index_dir):
 
 def index_binary(resource, response):
     print("INDEXING BINARY...")
-    index_writer = index_storage.open_index().writer()
-    try:
-        index_writer.add_document(
-            url=resource.indexable_url,
-            fetchable_url=resource.fetchable_url,
-            domain=resource.normalized_host,
-            content_type=response.content_type,
-            charset=response.charset or "none",
-            indexed_at=datetime.utcnow(),
-        )
-        index_writer.commit()
-    except:
-        index_writer.cancel()
+
+    index_writer.add_document(
+        url=resource.indexable_url,
+        fetchable_url=resource.fetchable_url,
+        domain=resource.normalized_host,
+        content_type=response.content_type,
+        charset=response.charset or "none",
+        indexed_at=datetime.utcnow(),
+    )
 
 
 def index_prompt(resource, response):
     print("INDEXING PROMPT...")
-    index_writer = index_storage.open_index().writer()
-    try:
-        index_writer.add_document(
-            url=resource.indexable_url,
-            fetchable_url=resource.fetchable_url,
-            domain=resource.normalized_host,
-            content_type="input",
-            charset=response.charset or "none",
-            prompt=response.prompt,
-            indexed_at=datetime.utcnow(),
-        )
-        index_writer.commit()
-    except:
-        index_writer.cancel()
+    index_writer.add_document(
+        url=resource.indexable_url,
+        fetchable_url=resource.fetchable_url,
+        domain=resource.normalized_host,
+        content_type="input",
+        charset=response.charset or "none",
+        prompt=response.prompt,
+        indexed_at=datetime.utcnow(),
+    )
 
 
 def index_content(resource, response):
     print("INDEXING CONTENT...")
-    index_writer = index_storage.open_index().writer()
-    try:
-        index_writer.add_document(
-            url=resource.indexable_url,
-            fetchable_url=resource.fetchable_url,
-            domain=resource.normalized_host,
-            content_type=response.content_type,
-            charset=response.charset or "none",
-            content=response.content,
-            regex=response.content,
-            indexed_at=datetime.utcnow(),
-        )
-        index_writer.commit()
-    except:
-        index_writer.cancel()
+    index_writer.add_document(
+        url=resource.indexable_url,
+        fetchable_url=resource.fetchable_url,
+        domain=resource.normalized_host,
+        content_type=response.content_type,
+        charset=response.charset or "none",
+        content=response.content,
+        regex=response.content,
+        indexed_at=datetime.utcnow(),
+    )
 
 
 def get_robots_file(robot_host):
@@ -380,6 +366,8 @@ def run_crawl(should_run_destructive=False, seed_urls=[]):
     if should_run_destructive:
         backup_old_index(INDEX_DIR_CURRENT, INDEX_DIR_BACKUP)
         create_index(index_dir)
+    global index_writer
+    index_writer = index_storage.open_index().writer()
     global visited_urls
     visited_urls = [] if should_run_destructive else load_visited_urls(INDEX_DIR_CURRENT)
 
@@ -406,12 +394,10 @@ def run_crawl(should_run_destructive=False, seed_urls=[]):
     seed_request_resources = [GeminiResource(url) for url in seed_request_urls]
     for resource in seed_request_resources:
         crawl(resource)
+    index_writer.commit()
 
     pickle_robot_file_map(robot_file_map, index_dir)
 
-    ix = open_dir(index_dir)
-    ix.optimize()
-
     index_statistics = compute_index_statistics(index_dir)
     print_index_statistics(index_statistics, crawl_statistics)
     persist_statistics(index_statistics, crawl_statistics, should_run_destructive, "statistics.csv")