Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -27,3 +27,5 @@ POSTGRES_DB_PORT=
POSTGRES_PWD=
POSTGRES_SEEDS=
POSTGRES_USER=

MEDIAWIKI_PROXY_URL=
10 changes: 9 additions & 1 deletion hivemind_etl/mediawiki/etl.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import logging
import os
import shutil

from llama_index.core import Document
Expand All @@ -15,7 +16,14 @@ def __init__(
delete_dump_after_load: bool = True,
) -> None:
self.community_id = community_id
self.wikiteam_crawler = WikiteamCrawler(community_id, namespaces=namespaces)

self.proxy_url = os.getenv("MEDIAWIKI_PROXY_URL", "")
if self.proxy_url:
logging.info(f"Proxy is set to be used!")

self.wikiteam_crawler = WikiteamCrawler(
community_id, namespaces=namespaces, proxy_url=self.proxy_url
)

self.dump_dir = f"dump_{self.community_id}"
self.delete_dump_after_load = delete_dump_after_load
Expand Down
5 changes: 5 additions & 0 deletions hivemind_etl/mediawiki/wikiteam_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ def __init__(
force: bool = True,
curonly: bool = True,
namespaces: list[int] = [],
proxy_url: str = "",
**kwargs,
) -> None:
self.community_id = community_id
Expand All @@ -20,6 +21,7 @@ def __init__(
self.curonly = curonly
self.extra_params = kwargs
self.namespaces = namespaces
self.proxy_url = proxy_url

def crawl(self, api_url: str, dump_path: str) -> None:
"""
Expand Down Expand Up @@ -50,6 +52,9 @@ def crawl(self, api_url: str, dump_path: str) -> None:
if self.namespaces:
params.append(f"--namespaces")
params.append(f"{','.join(map(str, self.namespaces))}")
if self.proxy_url:
params.append(f"--proxy")
params.append(self.proxy_url)

# Add any extra parameters passed during initialization
for key, value in self.extra_params.items():
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,4 @@ defusedxml==0.7.1
pydantic==2.9.2
motor>=3.6, <4.0.0
tc-temporal-backend==1.0.0
wikiteam3==4.4.1
wikiteam3-fork-proxy==1.0.0
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

💡 Verification agent

🧩 Analysis chain

Dependency updated to support proxy functionality.

The dependency has been changed from `wikiteam3==4.4.1` to `wikiteam3-fork-proxy==1.0.0`, which appears to be a fork of wikiteam3 that adds proxy support.


🏁 Script executed:

#!/bin/bash
# Verify the availability and metadata of the new package
pip show wikiteam3-fork-proxy 2>/dev/null || echo "Package not found in PyPI"

# Try to find information about the package repository
pip install yolk3k -q 2>/dev/null
yolk -M wikiteam3-fork-proxy 2>/dev/null || echo "No metadata available"

Length of output: 200


Dependency Update Verification: Potential Source Configuration Issue

The dependency in requirements.txt (line 11):

wikiteam3-fork-proxy==1.0.0

has been updated to support proxy functionality. However, our verification could not locate the package: `pip show` returned nothing and no metadata could be retrieved. Note that `pip show` only inspects packages installed in the local environment, so this result alone does not prove that the package is missing from PyPI. Please confirm whether this forked dependency is published on PyPI or is intended to be installed from a non-PyPI source (such as a custom package index or a direct VCS reference). If it is the latter, consider updating the dependency specification (or the installation instructions) accordingly so that users can correctly obtain the package.