geminispace.info

Unnamed repository; edit this file 'description' to name the repository.
git clone git://code.clttr.info/geminispace.info.git
Log | Files | Refs | README | LICENSE

commit 35176988fc825f13131be2c3f573698675c09f89
parent 370e53eabbc7649f4ac6e424f46efebebed4b8f8
Author: Natalie Pendragon <natpen@natpen.net>
Date:   Mon, 25 May 2020 06:31:14 -0400

[crawl] Improve handling of quoting and unquoting URLs

Previously, every URL was unquoted at the very beginning of
GeminiResource instantiation. This was slightly wrong: it was fine for
normalized_url and indexable_url, but it resulted in fetchable_url
not being sent quoted, which it should be.

Diffstat:
M gus/lib/gemini.py | 12 +++++++++---
1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/gus/lib/gemini.py b/gus/lib/gemini.py
@@ -34,7 +34,6 @@ class GeminiRobotFileParser(RobotFileParser):
 
 class GeminiResource():
     def __init__(self, url, fully_qualified_parent_url=None, parent_hostname=None):
-        url = unquote(url)
         self.raw_url = url
         self.urlsplit, self.is_relative = GeminiResource.urlsplit_featureful(
             url,
@@ -132,15 +131,22 @@ class GeminiResource():
         if not self.is_valid:
             return None
         if self._indexable_url is None:
-            indexable_url = self.fetchable_url
+            indexable_url = unquote(self.fetchable_url)
             if self.urlsplit.port == 1965:
                 indexable_url = self.normalized_url.replace(self.urlsplit.hostname.lower() + ":1965", self.urlsplit.hostname.lower(), 1)
             self._indexable_url = indexable_url
         return self._indexable_url
 
+    # constructed from fetchable_url
+    # does not matter if quoted or unquoted so I choose arbitrarily to
+    # standardize on unquoting it.
     normalized_url = property(_get_normalized_url)
     normalized_host = property(_get_normalized_host)
+    # constructed from urlsplit or raw_url
+    # should be quoted.
     fetchable_url = property(_get_fetchable_url)
+    # constructed from fetchable_url
+    # should be unquoted.
     indexable_url = property(_get_indexable_url)
 
     def fetch(self):
@@ -151,7 +157,7 @@ class GeminiResource():
 
 
     def _get_normalized_url_and_host(self):
-        url_normalized = self.fetchable_url.lower().rstrip("/")
+        url_normalized = unquote(self.fetchable_url.lower().rstrip("/"))
         if self.urlsplit.port == 1965:
             url_normalized = url_normalized.replace(self.urlsplit.hostname.lower() + ":1965", self.urlsplit.hostname.lower(), 1)
         host_normalized = self.urlsplit.hostname.lower()