feat: Refactor outlets with multilingual support and dynamic queries
- Replace static articles array with dynamic source_keyword queries
- Use MongoDB _id as unique identifier for outlets
- Add multilingual translations (9 languages: ko, en, zh_cn, zh_tw, ja, fr, de, es, it)
- Add OutletService for database operations
- Add outlet migration script with Korean source_keyword matching
- Remove JSON file-based outlet loading
- Add /outlets/{outlet_id}/articles endpoint for dynamic article retrieval
This resolves the design issues with:
1. Static articles array requiring constant updates
2. Lack of multilingual support for outlet names/descriptions
3. Broken image URLs
4. Korean entity matching for article queries
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,129 @@
|
||||
"""
|
||||
Script to add source_keyword field to existing articles based on outlet mappings
|
||||
"""
|
||||
import asyncio
|
||||
import os
|
||||
from motor.motor_asyncio import AsyncIOMotorClient
|
||||
|
||||
# MongoDB connection settings. Both values can be overridden through
# environment variables of the same name; the defaults target a local server.
MONGODB_URL = os.getenv("MONGODB_URL", "mongodb://localhost:27017")
DB_NAME = os.getenv("DB_NAME", "news_api_db")

# Supported languages. Each code is used as the prefix of a
# "<code>_articles" collection processed by the migration.
LANGUAGES = ["ko", "en", "zh_cn", "zh_tw", "ja", "fr", "de", "es", "it"]
|
||||
|
||||
# Entity lists are consulted in this order; the first match wins.
_ENTITY_FIELDS = ('people', 'organizations', 'groups')


def _build_name_to_keyword(outlets):
    """Return a dict mapping outlet names and keywords to their source_keyword.

    Each outlet contributes its name (top-level ``name``, falling back to
    ``name_translations['ko']``) and its ``source_keyword`` itself as keys.
    Outlets without a usable ``source_keyword`` are reported and skipped so a
    single malformed document cannot abort the whole migration.
    """
    name_to_keyword = {}
    for outlet in outlets:
        source_keyword = outlet.get('source_keyword')
        if not source_keyword:
            print(f"Skipping outlet without source_keyword: {outlet.get('_id')}")
            continue

        # `or {}` also covers a stored null, which .get(key, {}) would not.
        translations = outlet.get('name_translations') or {}
        # NOTE(review): the top-level 'name' is presumably the Korean name;
        # confirm against the outlet schema.
        name_ko = outlet.get('name') or translations.get('ko')
        # Only strings can be matched against entity names (and are hashable).
        if name_ko and isinstance(name_ko, str):
            name_to_keyword[name_ko] = source_keyword

        # Also map the source_keyword to itself for direct matches.
        name_to_keyword[source_keyword] = source_keyword
    return name_to_keyword


def _match_source_keyword(entities, name_to_keyword):
    """Return the source_keyword of the first known entity name, else None.

    People are checked first, then organizations, then groups. Missing or
    null entity lists are treated as empty, and non-string entries are
    ignored instead of raising on the dict lookup.
    """
    if not isinstance(entities, dict):
        return None
    for field in _ENTITY_FIELDS:
        for name in entities.get(field) or ():
            if isinstance(name, str) and name in name_to_keyword:
                return name_to_keyword[name]
    return None


async def _apply_source_keywords(collection, batch):
    """Write pending ``(article_id, source_keyword)`` pairs to *collection*.

    Ids are grouped by keyword so each distinct keyword needs one
    ``update_many`` call instead of one ``update_one`` per article.
    Returns the number of pairs in *batch*.
    """
    ids_by_keyword = {}
    for article_id, source_keyword in batch:
        ids_by_keyword.setdefault(source_keyword, []).append(article_id)

    for source_keyword, article_ids in ids_by_keyword.items():
        await collection.update_many(
            {'_id': {'$in': article_ids}},
            {'$set': {'source_keyword': source_keyword}},
        )
    return len(batch)


async def migrate_article_source_keywords():
    """Add source_keyword to articles based on outlet mappings.

    Loads every document from the ``outlets`` collection, builds a
    name-to-keyword lookup, then walks each ``<language>_articles``
    collection and sets ``source_keyword`` on every article whose entities
    (people, organizations, groups) contain a known outlet name or keyword.
    Progress and totals are printed to stdout. The MongoDB client is closed
    even when the migration fails part-way.
    """
    client = AsyncIOMotorClient(MONGODB_URL)
    try:
        db = client[DB_NAME]

        # Get all outlets
        outlets = await db.outlets.find().to_list(length=None)
        print(f"Found {len(outlets)} outlets to process")

        name_to_keyword = _build_name_to_keyword(outlets)
        print(f"Created {len(name_to_keyword)} name-to-keyword mappings")

        batch_size = 100
        total_updated = 0

        # Process each language collection
        for language in LANGUAGES:
            collection_name = f"{language}_articles"
            articles_collection = db[collection_name]

            # A missing collection counts as empty, so this also skips
            # languages that have no collection at all.
            count = await articles_collection.count_documents({})
            if count == 0:
                print(f"Skipping empty collection: {collection_name}")
                continue

            print(f"\nProcessing {collection_name} ({count} articles)...")

            updated_in_lang = 0
            batch = []

            # Only `entities` (plus the implicit `_id`) is read, so do not
            # pull whole article bodies over the wire.
            cursor = articles_collection.find({}, {'entities': 1})
            async for article in cursor:
                source_keyword = _match_source_keyword(
                    article.get('entities'), name_to_keyword
                )
                if not source_keyword:
                    continue

                batch.append((article['_id'], source_keyword))

                # Flush once a full batch has accumulated
                if len(batch) >= batch_size:
                    updated_in_lang += await _apply_source_keywords(
                        articles_collection, batch
                    )
                    print(f" Updated {updated_in_lang} articles...", end='\r')
                    batch = []

            # Flush whatever is left after the cursor is exhausted
            if batch:
                updated_in_lang += await _apply_source_keywords(
                    articles_collection, batch
                )

            print(f" Updated {updated_in_lang} articles in {collection_name}")
            total_updated += updated_in_lang

        print("\n✓ Migration completed!")
        print(f"✓ Total articles updated across all languages: {total_updated}")
    finally:
        # Close connection, including on failure
        client.close()
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: run the migration coroutine to completion.
    asyncio.run(migrate_article_source_keywords())
|
||||
Reference in New Issue
Block a user