Files
site11/services/news-api/backend/scripts/migrate_article_source_keywords.py
jungwoo choi e467e76d02 feat: Refactor outlets with multilingual support and dynamic queries
- Replace static articles array with dynamic source_keyword queries
- Use MongoDB _id as unique identifier for outlets
- Add multilingual translations (9 languages: ko, en, zh_cn, zh_tw, ja, fr, de, es, it)
- Add OutletService for database operations
- Add outlet migration script with Korean source_keyword matching
- Remove JSON file-based outlet loading
- Add /outlets/{outlet_id}/articles endpoint for dynamic article retrieval

This resolves the design issues with:
1. Static articles array requiring constant updates
2. Lack of multilingual support for outlet names/descriptions
3. Broken image URLs
4. Korean entity matching for article queries

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-10-13 16:52:34 +09:00

130 lines
4.4 KiB
Python

"""
Script to add source_keyword field to existing articles based on outlet mappings
"""
import asyncio
import os
from motor.motor_asyncio import AsyncIOMotorClient
# Connection target; both values can be overridden through the environment.
MONGODB_URL = os.environ.get("MONGODB_URL", "mongodb://localhost:27017")
DB_NAME = os.environ.get("DB_NAME", "news_api_db")

# Language codes; each one selects a "<code>_articles" collection to migrate.
LANGUAGES = [
    "ko", "en", "zh_cn", "zh_tw", "ja",
    "fr", "de", "es", "it",
]
def _build_name_to_keyword(outlets):
    """Map each outlet's name and its source_keyword to that source_keyword.

    The name is taken from the outlet's ``name`` field, falling back to
    ``name_translations.ko``. Outlets without a ``source_keyword`` are
    reported and skipped instead of aborting the whole run with ``KeyError``.
    """
    mapping = {}
    for outlet in outlets:
        keyword = outlet.get('source_keyword')
        if not keyword:
            print(f"Skipping outlet without source_keyword: {outlet.get('_id')}")
            continue

        # `or {}` also covers a stored null, which .get()'s default does not.
        translations = outlet.get('name_translations') or {}
        name_ko = outlet.get('name') or translations.get('ko')
        if name_ko:
            mapping[name_ko] = keyword

        # Also map the source_keyword to itself for direct matches.
        mapping[keyword] = keyword
    return mapping


def _find_source_keyword(entities, name_to_keyword):
    """Return the keyword of the first mapped entity name, or None.

    Entity lists are checked in priority order: people, organizations, groups.
    """
    if not isinstance(entities, dict):
        # Missing or null `entities` means there is nothing to match on.
        return None
    for field in ('people', 'organizations', 'groups'):
        for name in entities.get(field) or []:
            # Non-string entries cannot match a name and may be unhashable.
            if isinstance(name, str) and name in name_to_keyword:
                return name_to_keyword[name]
    return None


async def _flush_updates(collection, pending):
    """Write pending (article_id, keyword) pairs; return how many were written.

    Ids are grouped by keyword so each distinct keyword costs one
    ``update_many`` round-trip instead of one ``update_one`` per article.
    """
    ids_by_keyword = {}
    for article_id, keyword in pending:
        ids_by_keyword.setdefault(keyword, []).append(article_id)

    for keyword, article_ids in ids_by_keyword.items():
        await collection.update_many(
            {'_id': {'$in': article_ids}},
            {'$set': {'source_keyword': keyword}},
        )
    return len(pending)


async def _migrate_collection(collection, collection_name, name_to_keyword,
                              batch_size=100):
    """Set source_keyword on matching articles of one collection.

    Returns the number of articles that matched an outlet and were updated.
    """
    count = await collection.count_documents({})
    if count == 0:
        print(f"Skipping empty collection: {collection_name}")
        return 0

    print(f"\nProcessing {collection_name} ({count} articles)...")

    updated = 0
    pending = []
    # Only `entities` (plus the implicit `_id`) is needed for matching, so
    # avoid pulling full article bodies over the wire.
    async for article in collection.find({}, {'entities': 1}):
        keyword = _find_source_keyword(article.get('entities'), name_to_keyword)
        if keyword is None:
            continue

        pending.append((article['_id'], keyword))
        if len(pending) >= batch_size:
            updated += await _flush_updates(collection, pending)
            print(f" Updated {updated} articles...", end='\r')
            pending = []

    # Write whatever is left over from the last partial batch.
    if pending:
        updated += await _flush_updates(collection, pending)

    print(f" Updated {updated} articles in {collection_name}")
    return updated


async def migrate_article_source_keywords():
    """Add source_keyword to articles based on outlet mappings.

    Reads every document of the ``outlets`` collection, builds a lookup from
    outlet names and source keywords to ``source_keyword``, then walks the
    ``<language>_articles`` collection of every entry in ``LANGUAGES`` and
    sets ``source_keyword`` on each article whose ``entities`` (people, then
    organizations, then groups) contain a mapped name. Progress and the final
    total are printed to stdout.

    The MongoDB client is always closed, even when the migration fails
    part-way through.
    """
    client = AsyncIOMotorClient(MONGODB_URL)
    try:
        db = client[DB_NAME]

        outlets = await db.outlets.find().to_list(length=None)
        print(f"Found {len(outlets)} outlets to process")

        name_to_keyword = _build_name_to_keyword(outlets)
        print(f"Created {len(name_to_keyword)} name-to-keyword mappings")

        total_updated = 0
        for language in LANGUAGES:
            collection_name = f"{language}_articles"
            total_updated += await _migrate_collection(
                db[collection_name], collection_name, name_to_keyword
            )

        print("\n✓ Migration completed!")
        print(f"✓ Total articles updated across all languages: {total_updated}")
    finally:
        # Release the connection pool regardless of success or failure.
        client.close()
if __name__ == "__main__":
    # Script entry point: run the async migration to completion.
    asyncio.run(migrate_article_source_keywords())