feat: stock news crawler service

- News crawling from Korean financial news sources
- REST API with health, streams, and trigger endpoints
- Redis Streams for async job processing
- Docker support (single-stage python:3.12-slim image)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
yakenator committed 2026-02-23 13:50:47 +09:00
commit 90748b2d3b
8 changed files with 273 additions and 0 deletions

.dockerignore (new file)

@@ -0,0 +1,8 @@
__pycache__
*.pyc
.git
.venv
*.egg-info
dist
build
.env

.gitignore (new file)

@@ -0,0 +1,22 @@
__pycache__/
*.py[cod]
*$py.class
*.so
*.egg-info/
dist/
build/
.eggs/
.venv/
venv/
.env
.vscode/
.idea/
*.swp
*.swo
.DS_Store
.pytest_cache/
htmlcov/
.coverage
coverage.xml
.mypy_cache/
.claude/

Dockerfile (new file)

@@ -0,0 +1,11 @@
FROM python:3.12-slim
WORKDIR /app
# Install the shared library first so this package's stock-common
# dependency resolves, then drop the copy from the image.
COPY stock-common/ /tmp/stock-common/
RUN pip install --no-cache-dir /tmp/stock-common/ && rm -rf /tmp/stock-common/
COPY stock-news-crawler/ .
RUN pip install --no-cache-dir .
CMD ["python", "-m", "stock_news_crawler.worker"]
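
A note on the build context: both COPY lines reference sibling directories, so the image has to be built from the repository root that contains stock-common/ and stock-news-crawler/ side by side, along the lines of docker build -f stock-news-crawler/Dockerfile -t stock-news-crawler . (the tag name is illustrative).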

pyproject.toml (new file)

@@ -0,0 +1,17 @@
[project]
name = "stock-news-crawler"
version = "0.1.0"
description = "Naver Finance news crawler service"
requires-python = ">=3.12"
dependencies = [
    "stock-common",
    "beautifulsoup4>=4.12",
    "lxml>=5.0",
]

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[tool.hatch.build.targets.wheel]
packages = ["src/stock_news_crawler"]
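
Note that stock-common is declared without a version pin and is presumably not published to an index; the Dockerfile above satisfies it by pip-installing the sibling stock-common/ directory before this package. For local development, something like pip install -e ../stock-common -e . should work, assuming the same sibling layout.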

src/stock_news_crawler/__init__.py (new file)

@@ -0,0 +1,3 @@
"""Naver Finance news crawler service."""
__version__ = "0.1.0"

src/stock_news_crawler/api.py (new file)

@@ -0,0 +1,35 @@
"""REST API for News crawler service.
Endpoints:
POST /collect/news - trigger news collection
GET /health - (from api_base)
GET /streams - (from api_base)
"""
from pydantic import BaseModel
from stock_common.api_base import create_app
from stock_common.queue import publish
TRIGGER_STREAM = "queue:trigger-news"
class NewsRequest(BaseModel):
stock_codes: list[str]
max_pages: int = 3
app = create_app(
title="stock-news-crawler",
streams=[TRIGGER_STREAM, "queue:raw-news"],
)
@app.post("/collect/news")
async def collect_news(req: NewsRequest):
msg_id = await publish(TRIGGER_STREAM, {
"type": "news",
"stock_codes": req.stock_codes,
"max_pages": req.max_pages,
})
return {"status": "queued", "message_id": msg_id, "stock_codes": req.stock_codes}
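
For reference, a minimal stdlib-only sketch of triggering a collection run. The port 8003 comes from worker.py further down; the stock code is just an example value:

import json
from urllib.request import Request, urlopen

# Payload fields mirror NewsRequest above; 005930 is an example KRX code.
payload = {"stock_codes": ["005930"], "max_pages": 2}
req = Request(
    "http://localhost:8003/collect/news",
    data=json.dumps(payload).encode(),
    headers={"Content-Type": "application/json"},
)
with urlopen(req) as resp:
    print(json.load(resp))  # {"status": "queued", "message_id": "...", "stock_codes": ["005930"]}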

src/stock_news_crawler/crawler.py (new file)

@@ -0,0 +1,104 @@
"""Naver Finance news crawler."""
import asyncio
import contextlib
from datetime import datetime
from typing import Any
from urllib.parse import urljoin
from bs4 import BeautifulSoup
from stock_common.collector_base import BaseCollector
from stock_common.config import settings
from stock_common.models.disclosure import Sentiment
from stock_common.models.news import News
NAVER_FINANCE_BASE = "https://finance.naver.com"
NAVER_NEWS_URL = f"{NAVER_FINANCE_BASE}/item/news.naver"
USER_AGENTS = [
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 Chrome/120.0.0.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/120.0.0.0 Safari/537.36",
]
class NewsCollector(BaseCollector):
"""Web scraper for Naver Finance stock news."""
def __init__(self) -> None:
super().__init__(rate_limit=settings.news_rate_limit, timeout=15)
self._ua_index = 0
self._seen_urls: set[str] = set()
def _get_user_agent(self) -> str:
ua = USER_AGENTS[self._ua_index % len(USER_AGENTS)]
self._ua_index += 1
return ua
async def collect(self, **kwargs: Any) -> None:
raise NotImplementedError("Use collect_news()")
async def collect_news(self, stock_code: str, max_pages: int = 3) -> list[News]:
articles: list[News] = []
for page in range(1, max_pages + 1):
page_articles = await self._collect_news_page(stock_code, page)
if not page_articles:
break
articles.extend(page_articles)
await asyncio.sleep(0.5)
self.logger.info("news_collected", stock_code=stock_code, count=len(articles))
return articles
async def _collect_news_page(self, stock_code: str, page: int) -> list[News]:
params = {"code": stock_code, "page": str(page)}
headers = {"User-Agent": self._get_user_agent()}
try:
html = await self._request("GET", NAVER_NEWS_URL, params=params, headers=headers)
except Exception as exc:
self.logger.warning("news_page_fetch_error", stock_code=stock_code, error=str(exc))
return []
if not isinstance(html, str):
return []
return self._parse_news_page(html, stock_code)
def _parse_news_page(self, html: str, stock_code: str) -> list[News]:
soup = BeautifulSoup(html, "lxml")
articles: list[News] = []
table = soup.select_one("table.type5")
if not table:
return articles
for row in table.select("tr"):
try:
title_cell = row.select_one("td.title a")
if not title_cell:
continue
title = title_cell.get_text(strip=True)
href = title_cell.get("href", "")
if not href:
continue
url = urljoin(NAVER_FINANCE_BASE, str(href))
if url in self._seen_urls:
continue
self._seen_urls.add(url)
source_cell = row.select_one("td.info")
source = source_cell.get_text(strip=True) if source_cell else "naver"
date_cell = row.select_one("td.date")
published_at = None
if date_cell:
date_text = date_cell.get_text(strip=True)
try:
published_at = datetime.strptime(date_text, "%Y.%m.%d %H:%M")
except ValueError:
with contextlib.suppress(ValueError):
published_at = datetime.strptime(date_text, "%Y.%m.%d")
articles.append(News(
url=url, title=title, stock_codes=[stock_code],
source=source, sentiment=Sentiment.NEUTRAL,
published_at=published_at, collected_at=datetime.now(),
))
except Exception:
pass
return articles
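
For reference, a hand-written sketch of the markup shape _parse_news_page expects, exercising only the selectors used above (table.type5, td.title a, td.info, td.date). The HTML is a stand-in, not captured Naver output, and constructing NewsCollector may require the usual settings/environment:

# Stand-in for a Naver Finance news list page; covers only the
# selectors _parse_news_page relies on, not the real markup.
SAMPLE_HTML = """
<table class="type5">
  <tr>
    <td class="title"><a href="/item/news_read.naver?article_id=1">Sample headline</a></td>
    <td class="info">SomeWire</td>
    <td class="date">2026.02.23 09:30</td>
  </tr>
</table>
"""

collector = NewsCollector()
articles = collector._parse_news_page(SAMPLE_HTML, "005930")
# One News item: url resolved against finance.naver.com, source "SomeWire",
# published_at datetime(2026, 2, 23, 9, 30), sentiment NEUTRAL.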

src/stock_news_crawler/worker.py (new file)

@@ -0,0 +1,73 @@
"""News crawler worker - runs as a standalone service.
Runs both the Redis stream consumer loop AND the REST API server concurrently.
"""
import asyncio
import structlog
import uvicorn
from stock_common.config import settings
from stock_common.logging_config import configure_logging
from stock_common.queue import consume, publish, ack
from stock_news_crawler.crawler import NewsCollector
from stock_news_crawler.api import app
logger = structlog.get_logger(service="news-crawler")
TRIGGER_STREAM = "queue:trigger-news"
OUTPUT_STREAM = "queue:raw-news"
GROUP = "news-crawlers"
CONSUMER = "news-worker-1"
async def worker_loop() -> None:
"""Redis stream consumer loop with auto-restart on failure."""
while True:
try:
async with NewsCollector() as crawler:
while True:
messages = await consume(TRIGGER_STREAM, GROUP, CONSUMER, count=1, block=5000)
for msg in messages:
stock_codes = msg.data.get("stock_codes", [])
max_pages = msg.data.get("max_pages", 3)
try:
for code in stock_codes:
articles = await crawler.collect_news(code, max_pages)
for article in articles:
await publish(OUTPUT_STREAM, article.model_dump())
await ack(TRIGGER_STREAM, GROUP, msg.message_id)
logger.info("task_completed", stock_count=len(stock_codes))
except Exception as exc:
logger.error("task_failed", error=str(exc))
except Exception as exc:
logger.error("worker_loop_crashed", error=str(exc))
await asyncio.sleep(10)
logger.info("worker_loop_restarting")
async def run() -> None:
configure_logging(level=settings.log_level, log_format=settings.log_format)
logger.info("news_crawler_starting")
config = uvicorn.Config(app, host="0.0.0.0", port=8003, log_level="info")
server = uvicorn.Server(config)
await asyncio.gather(
worker_loop(),
server.serve(),
return_exceptions=True,
)
def main() -> None:
asyncio.run(run())
if __name__ == "__main__":
main()
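
For reference, the logical shape of a trigger message as worker_loop reads it from msg.data; the on-the-wire Redis Streams encoding is handled by stock_common.queue, which is not part of this commit:

# Logical trigger payload; matches the keys worker_loop reads and the
# body that POST /collect/news publishes. Codes are example values.
trigger = {
    "type": "news",
    "stock_codes": ["005930", "000660"],
    "max_pages": 3,
}

Outside Docker, the worker runs the same way as the image's CMD: python -m stock_news_crawler.worker.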