diff --git a/.env.example b/.env.example index 21c139f..c719ba4 100644 --- a/.env.example +++ b/.env.example @@ -27,3 +27,5 @@ POSTGRES_DB_PORT= POSTGRES_PWD= POSTGRES_SEEDS= POSTGRES_USER= + +MEDIAWIKI_PROXY_URL= \ No newline at end of file diff --git a/hivemind_etl/mediawiki/etl.py b/hivemind_etl/mediawiki/etl.py index 3b45dde..1acc223 100644 --- a/hivemind_etl/mediawiki/etl.py +++ b/hivemind_etl/mediawiki/etl.py @@ -1,4 +1,5 @@ import logging +import os import shutil from llama_index.core import Document @@ -15,7 +16,14 @@ def __init__( delete_dump_after_load: bool = True, ) -> None: self.community_id = community_id - self.wikiteam_crawler = WikiteamCrawler(community_id, namespaces=namespaces) + + self.proxy_url = os.getenv("MEDIAWIKI_PROXY_URL", "") + if self.proxy_url: + logging.info(f"Proxy is set to be used!") + + self.wikiteam_crawler = WikiteamCrawler( + community_id, namespaces=namespaces, proxy_url=self.proxy_url + ) self.dump_dir = f"dump_{self.community_id}" self.delete_dump_after_load = delete_dump_after_load diff --git a/hivemind_etl/mediawiki/wikiteam_crawler.py b/hivemind_etl/mediawiki/wikiteam_crawler.py index 002a10c..d26900f 100644 --- a/hivemind_etl/mediawiki/wikiteam_crawler.py +++ b/hivemind_etl/mediawiki/wikiteam_crawler.py @@ -12,6 +12,7 @@ def __init__( force: bool = True, curonly: bool = True, namespaces: list[int] = [], + proxy_url: str = "", **kwargs, ) -> None: self.community_id = community_id @@ -20,6 +21,7 @@ def __init__( self.curonly = curonly self.extra_params = kwargs self.namespaces = namespaces + self.proxy_url = proxy_url def crawl(self, api_url: str, dump_path: str) -> None: """ @@ -50,6 +52,9 @@ def crawl(self, api_url: str, dump_path: str) -> None: if self.namespaces: params.append(f"--namespaces") params.append(f"{','.join(map(str, self.namespaces))}") + if self.proxy_url: + params.append(f"--proxy") + params.append(self.proxy_url) # Add any extra parameters passed during initialization for key, value in self.extra_params.items(): diff --git a/requirements.txt b/requirements.txt index fdfbd5b..6bdc60d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,4 +8,4 @@ defusedxml==0.7.1 pydantic==2.9.2 motor>=3.6, <4.0.0 tc-temporal-backend==1.0.0 -wikiteam3==4.4.1 +wikiteam3-fork-proxy==1.0.0