# site11/services/news-api/backend/scripts/migrate_article_source_keywords.py

"""
Script to add source_keyword field to existing articles based on outlet mappings
"""
import asyncio
import os
from motor.motor_asyncio import AsyncIOMotorClient

# Connection settings; each may be overridden through the environment
MONGODB_URL = os.environ.get("MONGODB_URL", "mongodb://localhost:27017")
DB_NAME = os.environ.get("DB_NAME", "news_api_db")

# Language codes handled by the migration
LANGUAGES = [
    "ko", "en", "zh_cn", "zh_tw", "ja",
    "fr", "de", "es", "it",
]

def _build_name_to_keyword(outlets):
    """Map each outlet's name and its own source_keyword to that source_keyword."""
    name_to_keyword = {}
    for outlet in outlets:
        source_keyword = outlet.get('source_keyword')
        if not source_keyword:
            # An outlet without a keyword can never produce an update, so skip
            # it instead of aborting the whole migration with a KeyError.
            continue

        # Prefer the top-level name, fall back to the Korean translation.
        # `or {}` also covers a name_translations field stored as None.
        translations = outlet.get('name_translations') or {}
        name_ko = outlet.get('name') or translations.get('ko')
        if name_ko:
            name_to_keyword[name_ko] = source_keyword

        # Also map the source_keyword to itself for direct matches
        name_to_keyword[source_keyword] = source_keyword
    return name_to_keyword


def _find_source_keyword(article, name_to_keyword):
    """Return the keyword of the first article entity found in the mapping, else None."""
    entities = article.get('entities') or {}
    if not isinstance(entities, dict):
        return None

    # Same priority order as before: people, then organizations, then groups.
    for entity_type in ('people', 'organizations', 'groups'):
        for name in entities.get(entity_type) or ():
            # Only strings are looked up; an unhashable entry (e.g. a dict)
            # would otherwise raise TypeError on the membership test.
            if not isinstance(name, str):
                continue
            source_keyword = name_to_keyword.get(name)
            if source_keyword:
                return source_keyword
    return None


async def _apply_updates(collection, pending):
    """Set source_keyword on every (_id, source_keyword) pair; return how many were sent."""
    for article_id, source_keyword in pending:
        await collection.update_one(
            {'_id': article_id},
            {'$set': {'source_keyword': source_keyword}},
        )
    return len(pending)


async def migrate_article_source_keywords(batch_size=100):
    """Add source_keyword to articles based on outlet mappings.

    Reads every document of the ``outlets`` collection, builds a lookup from
    outlet name / source_keyword to source_keyword, then walks each
    ``<language>_articles`` collection and sets ``source_keyword`` on every
    article whose entities (people, organizations, groups - in that order)
    contain a known name. Progress is reported with ``print``.

    Args:
        batch_size: Number of matched articles to accumulate before their
            updates are written and a progress line is printed.
    """
    # Connect to MongoDB
    client = AsyncIOMotorClient(MONGODB_URL)
    try:
        db = client[DB_NAME]

        # Get all outlets
        outlets = await db.outlets.find().to_list(length=None)
        print(f"Found {len(outlets)} outlets to process")

        name_to_keyword = _build_name_to_keyword(outlets)
        print(f"Created {len(name_to_keyword)} name-to-keyword mappings")

        # Process each language collection
        total_updated = 0
        for language in LANGUAGES:
            collection_name = f"{language}_articles"
            articles_collection = db[collection_name]

            # A missing collection counts as empty, so this also skips
            # languages that have no collection at all.
            count = await articles_collection.count_documents({})
            if count == 0:
                print(f"Skipping empty collection: {collection_name}")
                continue

            print(f"\nProcessing {collection_name} ({count} articles)...")

            updated_in_lang = 0
            pending = []

            # Only _id (returned by default) and entities are read, so project
            # the rest away instead of transferring whole article documents.
            async for article in articles_collection.find({}, {'entities': 1}):
                source_keyword = _find_source_keyword(article, name_to_keyword)
                if not source_keyword:
                    continue

                pending.append((article['_id'], source_keyword))

                # Execute batch update
                if len(pending) >= batch_size:
                    updated_in_lang += await _apply_updates(articles_collection, pending)
                    print(f"  Updated {updated_in_lang} articles...", end='\r')
                    pending = []

            # Update remaining batch
            if pending:
                updated_in_lang += await _apply_updates(articles_collection, pending)

            print(f"  Updated {updated_in_lang} articles in {collection_name}")
            total_updated += updated_in_lang

        print("\n✓ Migration completed!")
        print(f"✓ Total articles updated across all languages: {total_updated}")
    finally:
        # Close the connection even when the migration fails part-way.
        client.close()

if __name__ == "__main__":
    # Script entry point: build the migration coroutine and run it to completion.
    migration = migrate_article_source_keywords()
    asyncio.run(migration)