geminispace.info

Unnamed repository; edit this file 'description' to name the repository.
git clone git://code.clttr.info/geminispace.info.git
Log | Files | Refs | README | LICENSE

commit d96abf705580f8fece03e9dc9440c94e02bf8e66
parent 941b086b7d7e8b8e1e16bd92fd80c5cc17d1650b
Author: Natalie Pendragon <natpen@natpen.net>
Date:   Thu, 21 May 2020 10:59:33 -0400

[crawl] Add domain field to index

Diffstat:
M gus/crawl.py | 6 ++++++
1 file changed, 6 insertions(+), 0 deletions(-)

diff --git a/gus/crawl.py b/gus/crawl.py
@@ -124,6 +124,9 @@ def create_index(index_dir):
             stored=True,
             analyzer=UrlAnalyzer(),
         ),
+        domain=TEXT(
+            analyzer=UrlAnalyzer(),
+        ),
         content_type=TEXT(
             stored=True,
         ),
@@ -149,6 +152,7 @@ def index_binary(resource):
     try:
         index_writer.add_document(
             url=resource.fully_qualified_massaged_url,
+            domain=resource.normalized_host,
             content_type=resource.response.content_type,
             indexed_at=datetime.utcnow(),
         )
@@ -163,6 +167,7 @@ def index_prompt(resource):
     try:
         index_writer.add_document(
             url=resource.fully_qualified_massaged_url,
+            domain=resource.normalized_host,
             content_type="input",
             prompt=resource.response.prompt,
             indexed_at=datetime.utcnow(),
@@ -178,6 +183,7 @@ def index_content(resource):
     try:
         index_writer.add_document(
             url=resource.fully_qualified_massaged_url,
+            domain=resource.normalized_host,
             content_type=resource.response.content_type,
             content=resource.response.content,
             indexed_at=datetime.utcnow(),