geminispace.info

Unnamed repository; edit this file 'description' to name the repository.
git clone git://code.clttr.info/geminispace.info.git
Log | Files | Refs | README | LICENSE

commit 134b7f6c482cdfda95eae80c5d83ae1712fbacea
parent 64748f085254199e38203f84844201ac88aa27e7
Author: René Wagner <rwagner@rw-net.de>
Date:   Fri, 12 Feb 2021 08:05:34 +0100

correctly handle robots.txt

Honor the robots.txt entries of "indexer" and "gus" as well
as the default * section.

The robot_file_map.p must be deleted on a live instance
after this change has been applied to refetch all robots
files, as previously only empty files have been stored.

Diffstat:
Adocs/handling-robots.md | 14++++++++++++++
Mgus/crawl.py | 20++++++++++++--------
2 files changed, 26 insertions(+), 8 deletions(-)

diff --git a/docs/handling-robots.md b/docs/handling-robots.md @@ -0,0 +1,14 @@ +# robots.txt handling + +robots.txt is fetched for each (sub)domain before actually crawling the content. + +GUS honors the following User-agents: +* indexer +* gus +* * + +## robots.txt caching + +Every fetched robots.txt is cached in `index/robot_file_map.p`, even if they were empty/missing. + +To force a refetch of _all_ robots.txt for _all_ capsules, simply delete the file named above and run a crawl. diff --git a/gus/crawl.py b/gus/crawl.py @@ -438,6 +438,7 @@ def fetch_robots_file(robot_host): ) rp = GeminiRobotFileParser(robot_url) rp.read() + return rp def get_robots_file(robot_host): @@ -489,12 +490,15 @@ def crawl_page( robots_file = get_robots_file(gr.normalized_host) crawl_delay = None if robots_file is not None: - # keep overwriting the value of can_fetch with more specific user-agent values - # last one should win, and if not present, RobotFileParser will just return - # the higher level's value again - can_fetch = robots_file.can_fetch("*", gr.normalized_url) - can_fetch = robots_file.can_fetch("indexer", gr.normalized_url) - can_fetch = robots_file.can_fetch("gus", gr.normalized_url) + logging.debug("Found robots.txt for %s", gr.normalized_url) + # only fetch if both user-agents are allowed to fetch + # RobotFileParser will return the higher level value (*) if no specific + # value is found, but has no understanding that "gus" is a more specific + # form of an indexer + logging.debug("can_fetch indexer: %s",robots_file.can_fetch("indexer", gr.normalized_url)) + logging.debug("can_fetch gus: %s",robots_file.can_fetch("gus", gr.normalized_url)) + can_fetch = (robots_file.can_fetch("indexer", gr.normalized_url) and + robots_file.can_fetch("gus", gr.normalized_url)) # same approach as above - last value wins crawl_delay = robots_file.crawl_delay("*") @@ -502,8 +506,8 @@ def crawl_page( crawl_delay = robots_file.crawl_delay("gus") if not can_fetch: - logging.debug( - 
"Blocked by robots files, skipping: %s", + logging.info( + "Blocked by robots.txt, skipping: %s", gus.lib.logging.strip_control_chars(url), ) return