commit 90748b2d3bf999f26a7db26ecfee0abe843b3f4f Author: yakenator Date: Mon Feb 23 13:50:47 2026 +0900 feat: stock news crawler service - News crawling from Korean financial news sources - REST API with health, streams, and trigger endpoints - Redis Streams for async job processing - Docker support with multi-stage build Co-Authored-By: Claude Opus 4.6 diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..daf0201 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,8 @@ +__pycache__ +*.pyc +.git +.venv +*.egg-info +dist +build +.env diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..f3ebf17 --- /dev/null +++ b/.gitignore @@ -0,0 +1,22 @@ +__pycache__/ +*.py[cod] +*$py.class +*.so +*.egg-info/ +dist/ +build/ +.eggs/ +.venv/ +venv/ +.env +.vscode/ +.idea/ +*.swp +*.swo +.DS_Store +.pytest_cache/ +htmlcov/ +.coverage +coverage.xml +.mypy_cache/ +.claude/ diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..3542a5c --- /dev/null +++ b/Dockerfile @@ -0,0 +1,11 @@ +FROM python:3.12-slim + +WORKDIR /app + +COPY stock-common/ /tmp/stock-common/ +RUN pip install --no-cache-dir /tmp/stock-common/ && rm -rf /tmp/stock-common/ + +COPY stock-news-crawler/ . +RUN pip install --no-cache-dir . 
"""REST API for News crawler service.

Endpoints:
    POST /collect/news - trigger news collection
    GET /health - (from api_base)
    GET /streams - (from api_base)
"""

from pydantic import BaseModel

from stock_common.api_base import create_app
from stock_common.queue import publish

# Stream the worker's consumer group reads collection jobs from.
TRIGGER_STREAM = "queue:trigger-news"


class NewsRequest(BaseModel):
    """Request body for POST /collect/news."""

    stock_codes: list[str]
    max_pages: int = 3


app = create_app(
    title="stock-news-crawler",
    streams=[TRIGGER_STREAM, "queue:raw-news"],
)


@app.post("/collect/news")
async def collect_news(req: NewsRequest):
    """Queue a news-collection job on the trigger stream and echo it back."""
    job = {
        "type": "news",
        "stock_codes": req.stock_codes,
        "max_pages": req.max_pages,
    }
    msg_id = await publish(TRIGGER_STREAM, job)
    return {
        "status": "queued",
        "message_id": msg_id,
        "stock_codes": req.stock_codes,
    }
"""Naver Finance news crawler."""

import asyncio
import contextlib
from datetime import datetime
from typing import Any
from urllib.parse import urljoin

from bs4 import BeautifulSoup

from stock_common.collector_base import BaseCollector
from stock_common.config import settings
from stock_common.models.disclosure import Sentiment
from stock_common.models.news import News

NAVER_FINANCE_BASE = "https://finance.naver.com"
NAVER_NEWS_URL = f"{NAVER_FINANCE_BASE}/item/news.naver"

# Rotated per request so traffic looks less like a single automated client.
USER_AGENTS = [
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 Chrome/120.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/120.0.0.0 Safari/537.36",
]


class NewsCollector(BaseCollector):
    """Web scraper for Naver Finance stock news."""

    def __init__(self) -> None:
        super().__init__(rate_limit=settings.news_rate_limit, timeout=15)
        self._ua_index = 0
        # NOTE(review): grows without bound for the collector's lifetime; in a
        # long-running worker this both leaks memory and permanently suppresses
        # re-collection of previously seen URLs — confirm this is intended.
        self._seen_urls: set[str] = set()

    def _get_user_agent(self) -> str:
        """Return the next User-Agent string, cycling through USER_AGENTS."""
        ua = USER_AGENTS[self._ua_index % len(USER_AGENTS)]
        self._ua_index += 1
        return ua

    async def collect(self, **kwargs: Any) -> None:
        """Not supported; this collector is driven via collect_news()."""
        raise NotImplementedError("Use collect_news()")

    async def collect_news(self, stock_code: str, max_pages: int = 3) -> list[News]:
        """Collect news articles for *stock_code*, scanning up to *max_pages*.

        Stops early at the first empty page (end of listing or fetch error).
        """
        articles: list[News] = []
        for page in range(1, max_pages + 1):
            page_articles = await self._collect_news_page(stock_code, page)
            if not page_articles:
                break
            articles.extend(page_articles)
            await asyncio.sleep(0.5)  # politeness delay between list pages
        self.logger.info("news_collected", stock_code=stock_code, count=len(articles))
        return articles

    async def _collect_news_page(self, stock_code: str, page: int) -> list[News]:
        """Fetch and parse one page of the news listing; return [] on any failure."""
        params = {"code": stock_code, "page": str(page)}
        headers = {"User-Agent": self._get_user_agent()}
        try:
            html = await self._request("GET", NAVER_NEWS_URL, params=params, headers=headers)
        except Exception as exc:
            self.logger.warning("news_page_fetch_error", stock_code=stock_code, error=str(exc))
            return []
        if not isinstance(html, str):
            return []
        return self._parse_news_page(html, stock_code)

    def _parse_news_page(self, html: str, stock_code: str) -> list[News]:
        """Parse the `table.type5` news listing into News models.

        Rows are handled best-effort: a malformed row is logged and skipped
        rather than aborting the whole page.
        """
        soup = BeautifulSoup(html, "lxml")
        articles: list[News] = []
        table = soup.select_one("table.type5")
        if not table:
            return articles

        for row in table.select("tr"):
            try:
                title_cell = row.select_one("td.title a")
                if not title_cell:
                    continue
                title = title_cell.get_text(strip=True)
                href = title_cell.get("href", "")
                if not href:
                    continue
                url = urljoin(NAVER_FINANCE_BASE, str(href))
                if url in self._seen_urls:
                    continue  # cross-page / cross-call dedup
                self._seen_urls.add(url)

                source_cell = row.select_one("td.info")
                source = source_cell.get_text(strip=True) if source_cell else "naver"
                published_at = self._parse_date(row)

                articles.append(News(
                    url=url, title=title, stock_codes=[stock_code],
                    source=source, sentiment=Sentiment.NEUTRAL,
                    published_at=published_at, collected_at=datetime.now(),
                ))
            except Exception as exc:
                # Was a bare `pass`: still best-effort, but record what was dropped.
                self.logger.debug("news_row_parse_error", stock_code=stock_code, error=str(exc))
        return articles

    @staticmethod
    def _parse_date(row: Any) -> datetime | None:
        """Extract the publication timestamp from a listing row, if present.

        Tries the timestamped form first, then the date-only form; returns
        None when the cell is absent or matches neither format.
        """
        date_cell = row.select_one("td.date")
        if not date_cell:
            return None
        date_text = date_cell.get_text(strip=True)
        for fmt in ("%Y.%m.%d %H:%M", "%Y.%m.%d"):
            with contextlib.suppress(ValueError):
                return datetime.strptime(date_text, fmt)
        return None
logger.info("worker_loop_restarting") + + +async def run() -> None: + configure_logging(level=settings.log_level, log_format=settings.log_format) + logger.info("news_crawler_starting") + + config = uvicorn.Config(app, host="0.0.0.0", port=8003, log_level="info") + server = uvicorn.Server(config) + + await asyncio.gather( + worker_loop(), + server.serve(), + return_exceptions=True, + ) + + +def main() -> None: + asyncio.run(run()) + + +if __name__ == "__main__": + main()