From 68cc70118f7daf7bbb684dae8ebb09cb3bf69939 Mon Sep 17 00:00:00 2001 From: jungwoo choi Date: Fri, 3 Oct 2025 17:27:26 +0900 Subject: [PATCH] fix: Sync News API models with actual MongoDB schema MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## 🔧 Model Synchronization Updated Pydantic models to match actual article structure in MongoDB ### Changes - **Article Model**: Complete restructure to match MongoDB documents - Added Subtopic, Reference, Entities nested models - Changed created_at to Union[str, datetime] with serializer - Added all pipeline metadata fields (job_id, keyword_id, etc.) - Added translation & image fields - Changed category (single) to categories (array) - **ArticleSummary Model**: Updated for list responses - Synced with actual MongoDB structure - Added news_id, categories array, images array, source_keyword - **ArticleService**: Fixed category filtering - Changed "category" to "categories" (array field) - Updated search to match subtopics.title, categories, and source_keyword (replacing the removed content and tags fields) - Implemented MongoDB aggregation for category list ### Verified Fields ✅ news_id, title, summary, created_at, language ✅ subtopics (array of {title, content[]}) ✅ categories (array), entities (nested object) ✅ references (array), source_keyword, source_count ✅ pipeline_stages, job_id, keyword_id, processing_time ✅ images (array), image_prompt, translated_languages ### Testing - Validated with actual English articles (20,966 total) - Search functionality working (15,298 AI-related articles) - Categories endpoint returning 1000+ unique categories - All datetime fields properly serialized to ISO format 🤖 Generated with [Claude Code](https://claude.ai/claude-code) Co-Authored-By: Claude --- .../news-api/backend/app/models/article.py | 94 ++++++++++++++----- .../backend/app/services/article_service.py | 23 ++++- 2 files changed, 91 insertions(+), 26 deletions(-) diff --git a/services/news-api/backend/app/models/article.py 
b/services/news-api/backend/app/models/article.py index fb5376c..ef7d64d 100644 --- a/services/news-api/backend/app/models/article.py +++ b/services/news-api/backend/app/models/article.py @@ -1,34 +1,79 @@ -from pydantic import BaseModel, Field -from typing import Optional, List +from pydantic import BaseModel, Field, field_serializer +from typing import Optional, List, Dict, Any, Union from datetime import datetime +class Subtopic(BaseModel): + title: str + content: List[str] + +class Reference(BaseModel): + title: str + link: str + source: str + published: Optional[str] = None + +class Entities(BaseModel): + people: List[str] = [] + organizations: List[str] = [] + groups: List[str] = [] + countries: List[str] = [] + events: List[str] = [] + class Article(BaseModel): id: str = Field(alias="_id") + news_id: str title: str - content: str summary: Optional[str] = None + created_at: Union[str, datetime] language: str - category: Optional[str] = None - tags: Optional[List[str]] = [] - source_url: Optional[str] = None - image_url: Optional[str] = None - author: Optional[str] = None - published_at: Optional[datetime] = None - created_at: datetime - updated_at: Optional[datetime] = None + + @field_serializer('created_at') + def serialize_created_at(self, value: Union[str, datetime], _info): + if isinstance(value, datetime): + return value.isoformat() + return value + + # Content fields + subtopics: List[Subtopic] = [] + categories: List[str] = [] + entities: Optional[Entities] = None + + # Source information + source_keyword: Optional[str] = None + source_count: Optional[int] = None + references: List[Reference] = [] + + # Pipeline metadata + job_id: Optional[str] = None + keyword_id: Optional[str] = None + pipeline_stages: List[str] = [] + processing_time: Optional[float] = None + + # Translation & Image + ref_news_id: Optional[str] = None + rss_guid: Optional[str] = None + image_prompt: Optional[str] = None + images: List[str] = [] + translated_languages: List[str] = [] 
class Config: populate_by_name = True json_schema_extra = { "example": { "_id": "507f1f77bcf86cd799439011", + "news_id": "uuid-string", "title": "Sample News Article", - "content": "This is the full content of the article...", - "summary": "A brief summary of the article", - "language": "ko", - "category": "technology", - "tags": ["AI", "tech", "innovation"], - "created_at": "2024-01-01T00:00:00Z" + "summary": "A brief summary", + "language": "en", + "created_at": "2024-01-01T00:00:00Z", + "subtopics": [ + { + "title": "Main Topic", + "content": ["Content paragraph 1", "Content paragraph 2"] + } + ], + "categories": ["technology", "business"], + "images": ["http://image-url.com/image.png"] } } @@ -41,13 +86,20 @@ class ArticleList(BaseModel): class ArticleSummary(BaseModel): id: str = Field(alias="_id") + news_id: str title: str summary: Optional[str] = None language: str - category: Optional[str] = None - image_url: Optional[str] = None - published_at: Optional[datetime] = None - created_at: datetime + categories: List[str] = [] + images: List[str] = [] + created_at: Union[str, datetime] + source_keyword: Optional[str] = None + + @field_serializer('created_at') + def serialize_created_at(self, value: Union[str, datetime], _info): + if isinstance(value, datetime): + return value.isoformat() + return value class Config: populate_by_name = True diff --git a/services/news-api/backend/app/services/article_service.py b/services/news-api/backend/app/services/article_service.py index 0900a99..adc6016 100644 --- a/services/news-api/backend/app/services/article_service.py +++ b/services/news-api/backend/app/services/article_service.py @@ -27,7 +27,7 @@ class ArticleService: # 필터 구성 query = {} if category: - query["category"] = category + query["categories"] = category # category -> categories (배열) # 전체 개수 total = await collection.count_documents(query) @@ -97,9 +97,10 @@ class ArticleService: query = { "$or": [ {"title": {"$regex": keyword, "$options": "i"}}, - {"content": 
{"$regex": keyword, "$options": "i"}}, {"summary": {"$regex": keyword, "$options": "i"}}, - {"tags": {"$regex": keyword, "$options": "i"}} + {"subtopics.title": {"$regex": keyword, "$options": "i"}}, + {"categories": {"$regex": keyword, "$options": "i"}}, + {"source_keyword": {"$regex": keyword, "$options": "i"}} ] } @@ -130,5 +131,17 @@ class ArticleService: """카테고리 목록 조회""" collection = get_collection(language) - categories = await collection.distinct("category") - return [cat for cat in categories if cat] + # categories는 배열이므로 모든 배열 요소를 추출 + pipeline = [ + {"$unwind": "$categories"}, + {"$group": {"_id": "$categories"}}, + {"$sort": {"_id": 1}} + ] + + cursor = collection.aggregate(pipeline) + categories = [] + async for doc in cursor: + if doc["_id"]: + categories.append(doc["_id"]) + + return categories