geminispace.info

Unnamed repository; edit this file 'description' to name the repository.
git clone git://code.clttr.info/geminispace.info.git
Log | Files | Refs | README | LICENSE

commit e2ec756bbdcae99d21e7f7157f9e6fb306981dc3
parent ca95ee48b9c2bfb317202c2500e00b178c9b6666
Author: René Wagner <rwa@clttr.info>
Date:   Thu, 26 May 2022 19:42:23 +0200

move data deletion to indexing

Diffstat:
M gus/build_index.py | 45 ++++++++++++++++++++++++++++++++++++++++-----
M gus/crawl.py | 30 ------------------------------
M gus/lib/search.py | 2 +-
3 files changed, 41 insertions(+), 36 deletions(-)

diff --git a/gus/build_index.py b/gus/build_index.py @@ -86,6 +86,44 @@ def build_index(should_run_destructive=False): db = init_db(f"{index_dir}/{constants.DB_FILENAME}") index = search.Index(index_dir, should_run_destructive) + # delete pages that never successfull crawled + count=0 + q = Page.select().where(Page.last_crawl_success_at.is_null(True) & Page.last_crawl_at.is_null(False)) + for page in q.iterator(): + try: + index.delete_by_term("url_id", page.url) + page.delete_instance() + count+=1 + except Exception as e: + logging.error("Failed to delete row %s with outdated successful crawl: %s", page.id, e) + + logging.warn("Deleted %d rows without successfull crawl", count) + + # delete pages with last crawl success older than 30 days which have been recrawled since than + # this avoids deletion of files that have a change_frequency longer than our timeout + #q = Page.select().where((Page.last_crawl_at > Page.last_crawl_success_at) & (Page.last_crawl_success_at < (datetime.now() + timedelta(days=-30)))) + #try: + # domains = q.execute() + # for del_domain in domains: + # logging.warn("Deleting pages for domain: %s", del_domain.domain) + # # Page.delete().where(Page.domain = domain) + #except Exception as e: + # logging.error("Failed to delete domains with outdated successful crawl: %s", e) + + # delete pages with last crawl success older than 30 days which have been recrawled since than + # this avoids deletion of files that have a change_frequency longer than our timeout + count=0 + q = Page.select().where((Page.last_crawl_at > Page.last_crawl_success_at) & (Page.last_crawl_success_at < (datetime.now() + timedelta(days=-30)))) + for page in q.iterator(): + try: + index.delete_by_term("url_id", page.url) + page.delete_instance() + count+=1 + except Exception as e: + logging.error("Failed to delete row %s with outdated successful crawl: %s", page.id, e) + + logging.warn("Deleted %d rows with outdated successful crawl", count) + if (should_run_destructive): 
pages = Page.raw( """SELECT p.* FROM page AS p @@ -105,17 +143,14 @@ OR (p.content_type LIKE 'text/%' AND p.size <= ?))""", constants.MAXIMUM_TEXT_PA for page in pages.iterator(): index_page(index, page) + page.indexed_at = datetime.utcnow() + page.save() try: logging.info("Commiting search index...") index.close() logging.info("Updating raw data...") - timestamp = datetime.utcnow() - for page in pages.iterator(): - page.indexed_at = timestamp; - page.save() - except Exception as e: logging.error('Closing of index failed: %s', e); diff --git a/gus/crawl.py b/gus/crawl.py @@ -545,36 +545,6 @@ def run_crawl(should_run_destructive=False, seed_urls=[]): global max_crawl_depth max_crawl_depth = 500 - # delete pages that never successfull crawled - q = Page.delete().where(Page.last_crawl_success_at.is_null(True) & Page.last_crawl_at.is_null(False)) - try: - count = q.execute() - if count > 0: - logging.warn("Deleted %d rows without successfull crawl", count) - except Exception as e: - logging.error("Failed to delete rows without successfull crawl: %s", e) - - # delete pages with last crawl success older than 30 days which have been recrawled since than - # this avoids deletion of files that have a change_frequency longer than our timeout - #q = Page.select().where((Page.last_crawl_at > Page.last_crawl_success_at) & (Page.last_crawl_success_at < (datetime.now() + timedelta(days=-30)))) - #try: - # domains = q.execute() - # for del_domain in domains: - # logging.warn("Deleting pages for domain: %s", del_domain.domain) - # # Page.delete().where(Page.domain = domain) - #except Exception as e: - # logging.error("Failed to delete domains with outdated successful crawl: %s", e) - - # delete pages with last crawl success older than 30 days which have been recrawled since than - # this avoids deletion of files that have a change_frequency longer than our timeout - q = Page.delete().where((Page.last_crawl_at > Page.last_crawl_success_at) & (Page.last_crawl_success_at < 
(datetime.now() + timedelta(days=-30)))) - try: - count = q.execute() - if count > 0: - logging.warn("Deleted %d rows with outdated successful crawl", count) - except Exception as e: - logging.error("Failed to delete rows with outdated successful crawl: %s", e) - global failure_count failure_count = {} diff --git a/gus/lib/search.py b/gus/lib/search.py @@ -82,7 +82,7 @@ class Index: def add_document(self, document): self._rolling_writer().update_document(**document) - def delete_by_term(self, key, val): # TODO delete_document + def delete_by_term(self, key, val): self._rolling_writer().delete_by_term(key, val, searcher=None) def parse_query(self, query):