geminispace.info

git clone git://code.clttr.info/geminispace.info.git

commit af967cc728bc97af217c8813ecb1156b80d30109
parent 39edf728476cc455ee5d076b9969a6de35e4924c
Author: René Wagner <rwagner@rw-net.de>
Date:   Mon,  8 Feb 2021 17:43:19 +0100

add some forbidden URIs & set max_crawl_depth

Diffstat:
M gus/crawl.py | 60 ++++++++++++++++++++++++++++++++++++++----------------------
1 file changed, 38 insertions(+), 22 deletions(-)
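
Most of this change extends EXCLUDED_URL_PREFIXES, the list of URL prefixes the crawler refuses to fetch. The diff below only shows the list itself; as a minimal sketch of how such a list is typically consumed (is_excluded() is a hypothetical helper for illustration, not a function from gus/crawl.py):

EXCLUDED_URL_PREFIXES = [
    "gemini://www.youtube.com/",
    "gemini://kamalatta.ddnss.de/",
]

def is_excluded(url: str) -> bool:
    # Skip a candidate URL when it starts with any forbidden prefix.
    return any(url.startswith(prefix) for prefix in EXCLUDED_URL_PREFIXES)

assert is_excluded("gemini://kamalatta.ddnss.de/some/big/file.bin")
assert not is_excluded("gemini://geminispace.info/")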

diff --git a/gus/crawl.py b/gus/crawl.py
@@ -40,15 +40,35 @@ EXCLUDED_URL_PREFIXES = [
     "gemini://localhost",
     "gemini://example.org",
     "gemini://example.com",
+    "gemini://www.youtube.com/",
     "gemini://gemini.conman.org/test",
     "gemini://gemini.circumlunar.space/users/fgaz/calculator/",
-    "gemini://gemini.bortzmeyer.org/rfc-mirror/",
+    # all combinations of a tictactoe board
+    "gemini://tictactoe.lanterne.chilliet.eu",
+
+    # serving big files and slooow capsule -> takes to long to crawl
+    "gemini://kamalatta.ddnss.de/",
+
+    # ASCII art with emulated modem speed
+    "gemini://ansi.hrtk.in/",
+    "gemini://matrix.kiwifarms.net",
+
+    # ZachDeCooks songs
+    "gemini://songs.zachdecook.com/song.gmi.php/",
+    "gemini://songs.zachdecook.com/chord.svg/",
+
+    # OmarPolos BSD ports
+    'gemini://gemini.omarpolo.com/cgi/gempkg',
+
+    # breaks crawl due to recursion overflow
+    "gemini://cadence.moe/chapo/",
+
     "gemini://nixo.xyz/reply/",
     "gemini://nixo.xyz/notify",
-    "gemini://blah.com/",
     "gemini://gemini.thebackupbox.net/queryresponse",
     "gemini://gem.garichankar.com/share_audio",
+
     # various failing resources on runjimmyrunrunyoufuckerrun.com
     "gemini://runjimmyrunrunyoufuckerrun.com/fonts/",
     "gemini://runjimmyrunrunyoufuckerrun.com/tmp/",
@@ -62,12 +82,12 @@ EXCLUDED_URL_PREFIXES = [
     "gemini://gus.guru/backlinks?",
     "gemini://gus.guru/threads",
-    "gemini://geminispace.info/search",
-    "gemini://geminispace.info/v/search",
-    "gemini://geminispace.info/search",
-    "gemini://geminispace.info/v/search",
-    "gemini://geminispace.info/add-seed",
-    "gemini://geminispace.info/backlinks",
+    "gemini://geminispace.info/search/",
+    "gemini://geminispace.info/v/search/",
+    "gemini://geminispace.info/search?",
+    "gemini://geminispace.info/v/search?",
+    "gemini://geminispace.info/add-seed?",
+    "gemini://geminispace.info/backlinks?",
     "gemini://geminispace.info/threads",
 
     # Houston
     "gemini://houston.coder.town/search?",
@@ -134,6 +154,9 @@ EXCLUDED_URL_PREFIXES = [
     # youtube mirror
     "gemini://pon.ix.tc/cgi-bin/youtube.cgi?",
+    # guardian mirror
+    "gemini://guardian.shit.cx/",
+
     # wikipedia proxy
     "gemini://wp.pitr.ca/",
     "gemini://wp.glv.one/",
@@ -141,7 +164,7 @@ EXCLUDED_URL_PREFIXES = [
     # client torture test
     "gemini://egsam.pitr.ca/",
     "gemini://egsam.glv.one/",
-
+
     # mozz's chat
     "gemini://chat.mozz.us/stream",
     "gemini://chat.mozz.us/submit",
@@ -163,6 +186,9 @@ EXCLUDED_URL_PREFIXES = [
     "gemini://gemini.thebackupbox.net/radio",
     "gemini://higeki.jp/radio",
+    # list of ~30000 stations, crawling takes too long
+    "gemini://gemini.tunerapp.org/stations/",
+
     # this page inexplicably breaks both build_index, as well as elpher
     # when I browse to it... I think it might have some weird encoding
     # issues in its content or something, but that's a problem for a
@@ -176,21 +202,11 @@ EXCLUDED_URL_PREFIXES = [
     # killing crawl, I think maybe because it's too big
     "gemini://gem.denarii.cloud/pichaindata.zip",
-    "gemini://matrix.kiwifarms.net",
-
     # these threads seem to expire
     "gemini://dioskouroi.xyz/thread",
 
     # french news mirrors, there's just too much
     "gemini://jpfox.fr/rss/",
-
-    # ZachDeCooks songs
-    "gemini://songs.zachdecook.com/song.gmi.php/",
-    "gemini://songs.zachdecook.com/chord.svg/",
-
-    # robots.txt not served correctly
-    "gemini://orrg.clttr.info/orrg.pl",
-    "gemini://gmndemo.clttr.info/orrg/orrg.pl",
 ]
 
 EXCLUDED_URL_PATHS = [
@@ -500,7 +516,7 @@ def crawl_page(
         )
     elif not crawl_delay:
         next_allowed_hit = domain_hit_timings[gr.normalized_host] + timedelta(
-            milliseconds=500
+            milliseconds=300
         )
     else:
         next_allowed_hit = domain_hit_timings[gr.normalized_host] + timedelta(
@@ -806,7 +822,7 @@ def run_crawl(should_run_destructive=False, seed_urls=[]):
     pathlib.Path(index_dir).mkdir(parents=True, exist_ok=True)
     global db
     db = init_db(f"{index_dir}/{constants.DB_FILENAME}")
-
+
     global robot_file_map
     robot_file_map = (
         {} if should_run_destructive else unpickle_robot_file_map(constants.INDEX_DIR)
@@ -814,7 +830,7 @@ def run_crawl(should_run_destructive=False, seed_urls=[]):
     global domain_hit_timings
     domain_hit_timings = {}
     global max_crawl_depth
-    max_crawl_depth = -1
+    max_crawl_depth = 100
     expired_resources = [GeminiResource(url) for url in load_expired_urls()]
     for resource in expired_resources:
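
The last two hunks adjust crawl pacing and recursion: the default per-domain delay drops from 500 ms to 300 ms, and max_crawl_depth goes from -1 (unlimited) to 100. A minimal sketch of that logic, assuming the robots.txt crawl-delay is given in seconds; only the names domain_hit_timings, crawl_delay and max_crawl_depth come from the diff, the helper functions are illustrative:

from datetime import datetime, timedelta

domain_hit_timings = {}  # normalized host -> datetime of the last request
max_crawl_depth = 100    # was -1 (no limit) before this commit

def may_hit(host, crawl_delay=None):
    # Wait out the politeness delay before contacting a host again:
    # the robots.txt crawl-delay if one was served, else the default gap.
    last_hit = domain_hit_timings.get(host)
    if last_hit is None:
        return True
    if crawl_delay:
        next_allowed_hit = last_hit + timedelta(seconds=crawl_delay)  # assumed unit
    else:
        next_allowed_hit = last_hit + timedelta(milliseconds=300)
    return datetime.now() >= next_allowed_hit

def should_follow_links(depth):
    # Stop queueing outbound links once the depth cap is reached.
    return max_crawl_depth < 0 or depth < max_crawl_depth

Capping the depth at 100 guards against link loops like the tictactoe capsule excluded above, where every board state links to further board states and recursion previously overflowed.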