- Replace static articles array with dynamic source_keyword queries
- Use MongoDB _id as unique identifier for outlets
- Add multilingual translations (9 languages: ko, en, zh_cn, zh_tw, ja, fr, de, es, it)
- Add OutletService for database operations
- Add outlet migration script with Korean source_keyword matching
- Remove JSON file-based outlet loading
- Add /outlets/{outlet_id}/articles endpoint for dynamic article retrieval
This resolves the design issues with:
1. Static articles array requiring constant updates
2. Lack of multilingual support for outlet names/descriptions
3. Broken image URLs
4. Korean entity matching for article queries
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
130 lines
4.4 KiB
Python
130 lines
4.4 KiB
Python
"""
|
|
Script to add source_keyword field to existing articles based on outlet mappings
|
|
"""
|
|
import asyncio
|
|
import os
|
|
from motor.motor_asyncio import AsyncIOMotorClient
|
|
|
|
# MongoDB connection settings; each value can be overridden through the
# environment variable of the same name.
MONGODB_URL = os.environ.get("MONGODB_URL", "mongodb://localhost:27017")
DB_NAME = os.environ.get("DB_NAME", "news_api_db")

# Supported languages. Articles for each code are stored in a collection
# named "<code>_articles".
LANGUAGES = [
    "ko",
    "en",
    "zh_cn",
    "zh_tw",
    "ja",
    "fr",
    "de",
    "es",
    "it",
]
|
|
|
|
def _build_name_to_keyword(outlets):
    """Map each outlet's name, and each source_keyword itself, to its source_keyword."""
    name_to_keyword = {}
    for outlet in outlets:
        keyword = outlet.get('source_keyword')
        if not keyword:
            # Nothing to tag articles with; skip this outlet instead of
            # aborting the whole migration with a KeyError.
            print(f"Skipping outlet without source_keyword: {outlet.get('_id')}")
            continue

        # Outlet name -> source_keyword, falling back to the 'ko' translation.
        # `or {}` also covers a name_translations field stored as null.
        name_ko = outlet.get('name') or (outlet.get('name_translations') or {}).get('ko')
        if name_ko:
            name_to_keyword[name_ko] = keyword

        # Also map the source_keyword to itself for direct matches.
        name_to_keyword[keyword] = keyword
    return name_to_keyword


def _match_source_keyword(article, name_to_keyword):
    """Return the source_keyword of the first known entity in the article, or None."""
    entities = article.get('entities')
    if not isinstance(entities, dict):
        # Missing or null entities: nothing to match against.
        return None

    # Priority order: people first, then organizations, then groups.
    for field in ('people', 'organizations', 'groups'):
        for name in entities.get(field) or ():
            keyword = name_to_keyword.get(name)
            if keyword:
                return keyword
    return None


async def _flush_updates(articles_collection, pending):
    """Write queued updates using one update_many per distinct source_keyword."""
    for keyword, article_ids in pending.items():
        await articles_collection.update_many(
            {'_id': {'$in': article_ids}},
            {'$set': {'source_keyword': keyword}},
        )


async def migrate_article_source_keywords(batch_size=100):
    """Add source_keyword to articles based on outlet mappings.

    Loads every document from the ``outlets`` collection and builds a lookup
    from outlet name / source_keyword to source_keyword. Then, for each
    language in ``LANGUAGES``, scans the ``<language>_articles`` collection
    and sets ``source_keyword`` on every article whose ``entities.people``,
    ``entities.organizations`` or ``entities.groups`` (checked in that order)
    contains a known name. Progress and totals are printed to stdout.

    Args:
        batch_size: Number of matched articles to queue before writing them
            to the database. Defaults to 100.

    Returns:
        None.

    Raises:
        Any exception raised by the database driver propagates to the caller;
        the client connection is closed in every case.
    """
    # Connect to MongoDB
    client = AsyncIOMotorClient(MONGODB_URL)
    try:
        db = client[DB_NAME]

        # Get all outlets
        outlets = await db.outlets.find().to_list(length=None)
        print(f"Found {len(outlets)} outlets to process")

        name_to_keyword = _build_name_to_keyword(outlets)
        print(f"Created {len(name_to_keyword)} name-to-keyword mappings")

        # Process each language collection
        total_updated = 0
        for language in LANGUAGES:
            collection_name = f"{language}_articles"
            articles_collection = db[collection_name]

            # A collection that does not exist also counts as empty.
            count = await articles_collection.count_documents({})
            if count == 0:
                print(f"Skipping empty collection: {collection_name}")
                continue

            print(f"\nProcessing {collection_name} ({count} articles)...")

            updated_in_lang = 0
            pending = {}  # source_keyword -> list of article _ids awaiting update
            queued = 0

            # Only `entities` (plus the implicit _id) is read, so do not
            # transfer the full article documents.
            cursor = articles_collection.find({}, {'entities': 1})
            async for article in cursor:
                source_keyword = _match_source_keyword(article, name_to_keyword)
                if source_keyword is None:
                    continue

                pending.setdefault(source_keyword, []).append(article['_id'])
                queued += 1

                # Execute batch update
                if queued >= batch_size:
                    await _flush_updates(articles_collection, pending)
                    updated_in_lang += queued
                    print(f" Updated {updated_in_lang} articles...", end='\r')
                    pending = {}
                    queued = 0

            # Update remaining batch
            if queued:
                await _flush_updates(articles_collection, pending)
                updated_in_lang += queued

            print(f" Updated {updated_in_lang} articles in {collection_name}")
            total_updated += updated_in_lang

        print("\n✓ Migration completed!")
        print(f"✓ Total articles updated across all languages: {total_updated}")
    finally:
        # Close the connection even when the migration fails part-way.
        client.close()
|
|
|
|
if __name__ == "__main__":
    # Script entry point: create the migration coroutine and run it to
    # completion on a fresh event loop.
    migration = migrate_article_source_keywords()
    asyncio.run(migration)
|