geminispace.info

Unnamed repository; edit this file 'description' to name the repository.
git clone git://code.clttr.info/geminispace.info.git
Log | Files | Refs | README | LICENSE

commit 009873a26d4cda643e16db159e12a96c0633282a
parent 59db14509562934de4e70f38d9038646ed0dc5e5
Author: Natalie Pendragon <natpen@natpen.net>
Date:   Sun, 24 May 2020 07:28:15 -0400

[crawl] Handle url fragments

Up to this point, fragments weren't being handled at all, so links to
two different fragments on the same page would both get indexed as
distinct results. With this change, we now strip fragments, so the only
thing that ends up in the index is the fragmentless URL, indexed once.

Diffstat:
M gus/lib/gemini.py | 9 ++++++++-
1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/gus/lib/gemini.py b/gus/lib/gemini.py @@ -108,7 +108,11 @@ class GeminiResource(): return None if self._fully_qualified_url is None: if self.is_relative: - url = urlunsplit(self.urlsplit) + # leave of fragment portion of urlsplit at [4] + urlsplit_parts = list(self.urlsplit[:4]) + urlsplit_parts.append("") + + url = urlunsplit(urlsplit_parts) else: raw_url_lower = self.raw_url.lower() if raw_url_lower.startswith("gemini://"): @@ -117,6 +121,9 @@ class GeminiResource(): url = "gemini{}".format(self.raw_url) else: url = "gemini://{}".format(self.raw_url) + # leave of fragment portion of urlsplit at [4] + if self.urlsplit[4] != "": + url = url.replace("#{}".format(self.urlsplit[4]), "") self._fully_qualified_url = url return self._fully_qualified_url