diff --git a/hivemind_etl/website/website_etl.py b/hivemind_etl/website/website_etl.py index 8de8397..0f2853c 100644 --- a/hivemind_etl/website/website_etl.py +++ b/hivemind_etl/website/website_etl.py @@ -1,3 +1,4 @@ +import logging from typing import Any from hivemind_etl.website.crawlee_client import CrawleeClient @@ -47,7 +48,13 @@ async def extract( """ if not urls: raise ValueError("No URLs provided for crawling") - extracted_data = await self.crawlee_client.crawl(urls) + + extracted_data = [] + for url in urls: + logging.info(f"Crawling {url} and its routes!") + extracted_data.extend(await self.crawlee_client.crawl(links=[url])) + + logging.info(f"Extracted {len(extracted_data)} documents!") if not extracted_data: raise ValueError(f"No data extracted from URLs: {urls}") diff --git a/tests/unit/test_website_etl.py b/tests/unit/test_website_etl.py index 412aa2f..e1a6e92 100644 --- a/tests/unit/test_website_etl.py +++ b/tests/unit/test_website_etl.py @@ -34,7 +34,7 @@ async def test_extract(self): extracted_data = await self.website_etl.extract(urls) self.assertEqual(extracted_data, mocked_data) - self.website_etl.crawlee_client.crawl.assert_awaited_once_with(urls) + self.website_etl.crawlee_client.crawl.assert_awaited_once_with(links=urls) def test_transform(self): """