feat: stock news crawler service
- News crawling from Korean financial news sources
- REST API with health, streams, and trigger endpoints
- Redis Streams for async job processing
- Docker support with multi-stage build

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
8
.dockerignore
Normal file
8
.dockerignore
Normal file
@ -0,0 +1,8 @@
|
|||||||
|
__pycache__
|
||||||
|
*.pyc
|
||||||
|
.git
|
||||||
|
.venv
|
||||||
|
*.egg-info
|
||||||
|
dist
|
||||||
|
build
|
||||||
|
.env
|
||||||
22
.gitignore
vendored
Normal file
22
.gitignore
vendored
Normal file
@ -0,0 +1,22 @@
|
|||||||
|
__pycache__/
|
||||||
|
*.py[cod]
|
||||||
|
*$py.class
|
||||||
|
*.so
|
||||||
|
*.egg-info/
|
||||||
|
dist/
|
||||||
|
build/
|
||||||
|
.eggs/
|
||||||
|
.venv/
|
||||||
|
venv/
|
||||||
|
.env
|
||||||
|
.vscode/
|
||||||
|
.idea/
|
||||||
|
*.swp
|
||||||
|
*.swo
|
||||||
|
.DS_Store
|
||||||
|
.pytest_cache/
|
||||||
|
htmlcov/
|
||||||
|
.coverage
|
||||||
|
coverage.xml
|
||||||
|
.mypy_cache/
|
||||||
|
.claude/
|
||||||
11
Dockerfile
Normal file
11
Dockerfile
Normal file
@ -0,0 +1,11 @@
|
|||||||
|
FROM python:3.12-slim

WORKDIR /app

# Install the shared library first so this layer is cached independently
# of service-code changes.
COPY stock-common/ /tmp/stock-common/
RUN pip install --no-cache-dir /tmp/stock-common/ && rm -rf /tmp/stock-common/

# NOTE(review): COPY paths imply the build context is the repo root
# (containing both stock-common/ and stock-news-crawler/) — confirm.
COPY stock-news-crawler/ .
RUN pip install --no-cache-dir .

# The worker module also serves the REST API (see worker.run).
CMD ["python", "-m", "stock_news_crawler.worker"]
||||||
17
pyproject.toml
Normal file
17
pyproject.toml
Normal file
@ -0,0 +1,17 @@
|
|||||||
|
[project]
name = "stock-news-crawler"
# Keep in sync with stock_news_crawler.__version__.
version = "0.1.0"
description = "Naver Finance news crawler service"
requires-python = ">=3.12"
dependencies = [
    # Shared models/queue/config library; installed from a local path in the
    # Dockerfile (no version pin here).
    "stock-common",
    "beautifulsoup4>=4.12",
    # lxml is the parser backend passed to BeautifulSoup in crawler.py.
    "lxml>=5.0",
]

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[tool.hatch.build.targets.wheel]
packages = ["src/stock_news_crawler"]
||||||
3
src/stock_news_crawler/__init__.py
Normal file
3
src/stock_news_crawler/__init__.py
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
"""Naver Finance news crawler service."""
|
||||||
|
|
||||||
|
__version__ = "0.1.0"
|
||||||
35
src/stock_news_crawler/api.py
Normal file
35
src/stock_news_crawler/api.py
Normal file
@ -0,0 +1,35 @@
|
|||||||
|
"""REST API for News crawler service.
|
||||||
|
|
||||||
|
Endpoints:
|
||||||
|
POST /collect/news - trigger news collection
|
||||||
|
GET /health - (from api_base)
|
||||||
|
GET /streams - (from api_base)
|
||||||
|
"""
|
||||||
|
|
||||||
|
from pydantic import BaseModel
|
||||||
|
|
||||||
|
from stock_common.api_base import create_app
|
||||||
|
from stock_common.queue import publish
|
||||||
|
|
||||||
|
TRIGGER_STREAM = "queue:trigger-news"
|
||||||
|
|
||||||
|
|
||||||
|
class NewsRequest(BaseModel):
    """Request body for POST /collect/news."""

    # Stock codes to crawl (sent as the Naver Finance `code` query param
    # by the worker's crawler — presumably KRX tickers; confirm).
    stock_codes: list[str]
    # Listing pages to fetch per stock code.
    max_pages: int = 3
||||||
|
|
||||||
|
|
||||||
|
# Shared app factory from stock_common.api_base; per the module docstring it
# contributes GET /health and GET /streams (presumably FastAPI — confirm).
app = create_app(
    title="stock-news-crawler",
    # Streams surfaced for monitoring: the inbound trigger queue and the
    # outbound raw-news queue written by the worker.
    streams=[TRIGGER_STREAM, "queue:raw-news"],
)
||||||
|
|
||||||
|
|
||||||
|
@app.post("/collect/news")
|
||||||
|
async def collect_news(req: NewsRequest):
|
||||||
|
msg_id = await publish(TRIGGER_STREAM, {
|
||||||
|
"type": "news",
|
||||||
|
"stock_codes": req.stock_codes,
|
||||||
|
"max_pages": req.max_pages,
|
||||||
|
})
|
||||||
|
return {"status": "queued", "message_id": msg_id, "stock_codes": req.stock_codes}
|
||||||
104
src/stock_news_crawler/crawler.py
Normal file
104
src/stock_news_crawler/crawler.py
Normal file
@ -0,0 +1,104 @@
|
|||||||
|
"""Naver Finance news crawler."""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import contextlib
|
||||||
|
from datetime import datetime
|
||||||
|
from typing import Any
|
||||||
|
from urllib.parse import urljoin
|
||||||
|
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
from stock_common.collector_base import BaseCollector
|
||||||
|
from stock_common.config import settings
|
||||||
|
from stock_common.models.disclosure import Sentiment
|
||||||
|
from stock_common.models.news import News
|
||||||
|
|
||||||
|
NAVER_FINANCE_BASE = "https://finance.naver.com"
|
||||||
|
NAVER_NEWS_URL = f"{NAVER_FINANCE_BASE}/item/news.naver"
|
||||||
|
|
||||||
|
USER_AGENTS = [
|
||||||
|
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 Chrome/120.0.0.0 Safari/537.36",
|
||||||
|
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/120.0.0.0 Safari/537.36",
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
class NewsCollector(BaseCollector):
    """Web scraper for Naver Finance stock news.

    Fetches the per-stock news listing pages and parses each table row into
    a ``News`` model.  User agents are rotated round-robin between requests,
    and already-seen article URLs are de-duplicated for the lifetime of the
    collector instance.
    """

    # Date formats observed in the listing's td.date cell, tried in order.
    _DATE_FORMATS = ("%Y.%m.%d %H:%M", "%Y.%m.%d")

    def __init__(self) -> None:
        super().__init__(rate_limit=settings.news_rate_limit, timeout=15)
        self._ua_index = 0  # round-robin cursor into USER_AGENTS
        # NOTE(review): grows unbounded for the lifetime of the instance —
        # fine for short-lived jobs; revisit if the worker keeps one
        # collector alive indefinitely.
        self._seen_urls: set[str] = set()

    def _get_user_agent(self) -> str:
        """Return the next user agent string in round-robin order."""
        ua = USER_AGENTS[self._ua_index % len(USER_AGENTS)]
        self._ua_index += 1
        return ua

    async def collect(self, **kwargs: Any) -> None:
        # BaseCollector's generic entry point is intentionally unsupported;
        # this service drives collection via collect_news().
        raise NotImplementedError("Use collect_news()")

    async def collect_news(self, stock_code: str, max_pages: int = 3) -> list[News]:
        """Collect news for *stock_code* across up to *max_pages* listing pages.

        Stops early when a page yields no articles (end of listing, fetch
        error, or parse failure).  Sleeps briefly between pages on top of the
        base rate limit to stay polite.
        """
        articles: list[News] = []
        for page in range(1, max_pages + 1):
            page_articles = await self._collect_news_page(stock_code, page)
            if not page_articles:
                break
            articles.extend(page_articles)
            await asyncio.sleep(0.5)
        self.logger.info("news_collected", stock_code=stock_code, count=len(articles))
        return articles

    async def _collect_news_page(self, stock_code: str, page: int) -> list[News]:
        """Fetch and parse one listing page; returns [] on any fetch error."""
        params = {"code": stock_code, "page": str(page)}
        headers = {"User-Agent": self._get_user_agent()}
        try:
            html = await self._request("GET", NAVER_NEWS_URL, params=params, headers=headers)
        except Exception as exc:
            # Best-effort: a failed page just ends pagination upstream.
            self.logger.warning("news_page_fetch_error", stock_code=stock_code, error=str(exc))
            return []
        if not isinstance(html, str):
            # _request may yield a non-text payload; only HTML is parseable.
            return []
        return self._parse_news_page(html, stock_code)

    def _parse_date(self, date_text: str) -> datetime | None:
        """Parse a listing date string, trying each known format in turn."""
        for fmt in self._DATE_FORMATS:
            with contextlib.suppress(ValueError):
                return datetime.strptime(date_text, fmt)
        return None

    def _parse_news_page(self, html: str, stock_code: str) -> list[News]:
        """Parse a listing page's news table (table.type5) into News models.

        Malformed rows are skipped.  Previously failures were silently
        swallowed (``except Exception: pass``); they are now logged at debug
        level so systematic markup changes become visible.
        """
        soup = BeautifulSoup(html, "lxml")
        articles: list[News] = []
        table = soup.select_one("table.type5")
        if not table:
            return articles

        for row in table.select("tr"):
            try:
                title_cell = row.select_one("td.title a")
                if not title_cell:
                    continue  # header/separator rows carry no title link
                title = title_cell.get_text(strip=True)
                href = title_cell.get("href", "")
                if not href:
                    continue
                url = urljoin(NAVER_FINANCE_BASE, str(href))
                if url in self._seen_urls:
                    continue  # de-duplicate across pages and calls
                self._seen_urls.add(url)

                source_cell = row.select_one("td.info")
                source = source_cell.get_text(strip=True) if source_cell else "naver"
                date_cell = row.select_one("td.date")
                published_at = (
                    self._parse_date(date_cell.get_text(strip=True)) if date_cell else None
                )

                articles.append(News(
                    url=url, title=title, stock_codes=[stock_code],
                    source=source, sentiment=Sentiment.NEUTRAL,
                    published_at=published_at, collected_at=datetime.now(),
                ))
            except Exception as exc:
                # Skip the malformed row but leave a trace instead of `pass`.
                self.logger.debug("news_row_parse_error", stock_code=stock_code, error=str(exc))
        return articles
||||||
73
src/stock_news_crawler/worker.py
Normal file
73
src/stock_news_crawler/worker.py
Normal file
@ -0,0 +1,73 @@
|
|||||||
|
"""News crawler worker - runs as a standalone service.
|
||||||
|
|
||||||
|
Runs both the Redis stream consumer loop AND the REST API server concurrently.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
|
||||||
|
import structlog
|
||||||
|
import uvicorn
|
||||||
|
|
||||||
|
from stock_common.config import settings
|
||||||
|
from stock_common.logging_config import configure_logging
|
||||||
|
from stock_common.queue import consume, publish, ack
|
||||||
|
|
||||||
|
from stock_news_crawler.crawler import NewsCollector
|
||||||
|
from stock_news_crawler.api import app
|
||||||
|
|
||||||
|
logger = structlog.get_logger(service="news-crawler")

# Stream / consumer-group wiring.
# NOTE(review): TRIGGER_STREAM duplicates the literal in api.py — keep the
# two in sync (or import one from the other).
TRIGGER_STREAM = "queue:trigger-news"
OUTPUT_STREAM = "queue:raw-news"  # crawled articles are published here
GROUP = "news-crawlers"           # Redis consumer group on the trigger stream
# NOTE(review): hard-coded consumer name — multiple replicas would share the
# same consumer identity and its pending entries; confirm single-instance
# deployment or derive this from hostname/pod name.
CONSUMER = "news-worker-1"
||||||
|
|
||||||
|
|
||||||
|
async def worker_loop() -> None:
    """Redis stream consumer loop with auto-restart on failure.

    Outer loop: if anything inside crashes (e.g. Redis connectivity loss),
    log it, back off 10 seconds, and restart with a fresh NewsCollector
    session.  Inner loop: consume one trigger message at a time, crawl the
    requested stock codes, publish each article to OUTPUT_STREAM, then ack.
    """
    while True:
        try:
            async with NewsCollector() as crawler:
                while True:
                    # Block up to 5s waiting for a single trigger message.
                    messages = await consume(TRIGGER_STREAM, GROUP, CONSUMER, count=1, block=5000)

                    for msg in messages:
                        stock_codes = msg.data.get("stock_codes", [])
                        max_pages = msg.data.get("max_pages", 3)

                        try:
                            for code in stock_codes:
                                articles = await crawler.collect_news(code, max_pages)
                                for article in articles:
                                    await publish(OUTPUT_STREAM, article.model_dump())

                            # Ack only after every code succeeded.
                            # NOTE(review): on failure the message stays pending
                            # with no visible retry/claim logic, and articles
                            # published before the failure would be re-published
                            # on redelivery — confirm this is intended.
                            await ack(TRIGGER_STREAM, GROUP, msg.message_id)
                            logger.info("task_completed", stock_count=len(stock_codes))
                        except Exception as exc:
                            logger.error("task_failed", error=str(exc))
        except Exception as exc:
            logger.error("worker_loop_crashed", error=str(exc))
            await asyncio.sleep(10)
        logger.info("worker_loop_restarting")
||||||
|
|
||||||
|
|
||||||
|
async def run() -> None:
    """Configure logging, then run the worker loop and API server concurrently."""
    configure_logging(level=settings.log_level, log_format=settings.log_format)
    logger.info("news_crawler_starting")

    # REST API bound on all interfaces, port 8003 (hard-coded).
    config = uvicorn.Config(app, host="0.0.0.0", port=8003, log_level="info")
    server = uvicorn.Server(config)

    # return_exceptions=True keeps one task alive if the other raises
    # (worker_loop also self-restarts internally).
    # NOTE(review): a server crash is swallowed silently here — the process
    # would keep running without its API; confirm that is intended.
    await asyncio.gather(
        worker_loop(),
        server.serve(),
        return_exceptions=True,
    )
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
    """Console entry point: start the asyncio event loop (see Dockerfile CMD)."""
    asyncio.run(run())


if __name__ == "__main__":
    main()
|
||||||
Reference in New Issue
Block a user