"""
Script to add source_keyword field to existing articles based on outlet mappings
"""
import asyncio
import os

from motor.motor_asyncio import AsyncIOMotorClient

# MongoDB connection settings
MONGODB_URL = os.getenv("MONGODB_URL", "mongodb://localhost:27017")
DB_NAME = os.getenv("DB_NAME", "news_api_db")

# Supported languages
LANGUAGES = ["ko", "en", "zh_cn", "zh_tw", "ja", "fr", "de", "es", "it"]

# Number of matched articles buffered before their updates are written
BATCH_SIZE = 100

# Entity lists are consulted in this order; the first match wins
ENTITY_FIELDS = ("people", "organizations", "groups")


def _build_name_to_keyword(outlets):
    """Build the lookup table from outlet names to their source_keyword.

    Each usable outlet contributes two entries: its name (``name``, falling
    back to ``name_translations['ko']``) and the source_keyword mapped to
    itself so entities that already hold the keyword match directly.

    Outlets with a missing or empty ``source_keyword`` are skipped: they
    cannot produce an update, and indexing them directly would abort the
    whole migration with a KeyError.

    Args:
        outlets: Iterable of outlet documents (dicts).

    Returns:
        Dict mapping entity name -> source_keyword.
    """
    name_to_keyword = {}
    for outlet in outlets:
        source_keyword = outlet.get('source_keyword')
        if not source_keyword:
            continue

        # The field may be stored as null, in which case .get() returns
        # None rather than the default.
        translations = outlet.get('name_translations')
        if not isinstance(translations, dict):
            translations = {}

        # Korean name -> source_keyword
        name_ko = outlet.get('name') or translations.get('ko')
        if name_ko:
            name_to_keyword[name_ko] = source_keyword

        # Also map the source_keyword to itself for direct matches
        name_to_keyword[source_keyword] = source_keyword

    return name_to_keyword


def _find_source_keyword(entities, name_to_keyword):
    """Return the source_keyword for the first entity that names an outlet.

    People are checked first, then organizations, then groups.

    Args:
        entities: The article's ``entities`` sub-document; anything that is
            not a dict (including None) is treated as "no entities".
        name_to_keyword: Mapping produced by ``_build_name_to_keyword``.

    Returns:
        The matching source_keyword, or None when nothing matches.
    """
    if not isinstance(entities, dict):
        return None

    for field in ENTITY_FIELDS:
        # ``or ()`` covers lists stored as null.
        for name in entities.get(field) or ():
            # Non-string entries (e.g. embedded documents) are unhashable
            # and would make the dict lookup raise TypeError.
            if isinstance(name, str) and name in name_to_keyword:
                return name_to_keyword[name]

    return None


async def _flush_updates(articles_collection, batch):
    """Write the buffered source_keyword values and return how many were sent."""
    for item in batch:
        await articles_collection.update_one(
            {'_id': item['_id']},
            {'$set': {'source_keyword': item['source_keyword']}}
        )
    return len(batch)


async def _migrate_collection(articles_collection, collection_name, name_to_keyword):
    """Tag every matching article in one language collection.

    Args:
        articles_collection: Motor collection holding the articles.
        collection_name: Collection name, used only for progress output.
        name_to_keyword: Mapping produced by ``_build_name_to_keyword``.

    Returns:
        Number of update operations issued for this collection.
    """
    # An absent collection reports zero documents, so this also covers
    # collections that do not exist.
    count = await articles_collection.count_documents({})
    if count == 0:
        print(f"Skipping empty collection: {collection_name}")
        return 0

    print(f"\nProcessing {collection_name} ({count} articles)...")

    updated = 0
    batch = []

    # Only _id (included by default) and entities are needed for matching.
    cursor = articles_collection.find({}, {'entities': 1})
    async for article in cursor:
        source_keyword = _find_source_keyword(
            article.get('entities'), name_to_keyword
        )
        if not source_keyword:
            continue

        batch.append({
            '_id': article['_id'],
            'source_keyword': source_keyword
        })

        # Execute batch update
        if len(batch) >= BATCH_SIZE:
            updated += await _flush_updates(articles_collection, batch)
            print(f"  Updated {updated} articles...", end='\r')
            batch = []

    # Update remaining batch
    if batch:
        updated += await _flush_updates(articles_collection, batch)

    print(f"  Updated {updated} articles in {collection_name}")
    return updated


async def migrate_article_source_keywords():
    """Add source_keyword to articles based on outlet mappings.

    Loads every outlet, builds a name -> source_keyword table, then walks
    the ``<language>_articles`` collection of each supported language and
    sets ``source_keyword`` on articles whose entities mention an outlet.
    Progress and totals are printed to stdout.

    The MongoDB client is closed even when the migration fails part-way.
    """
    # Connect to MongoDB
    client = AsyncIOMotorClient(MONGODB_URL)
    try:
        db = client[DB_NAME]

        # Get all outlets
        outlets = await db.outlets.find().to_list(length=None)
        print(f"Found {len(outlets)} outlets to process")

        name_to_keyword = _build_name_to_keyword(outlets)
        print(f"Created {len(name_to_keyword)} name-to-keyword mappings")

        skipped = sum(1 for outlet in outlets if not outlet.get('source_keyword'))
        if skipped:
            print(f"Skipped {skipped} outlets without a source_keyword")

        # Process each language collection
        total_updated = 0
        for language in LANGUAGES:
            collection_name = f"{language}_articles"
            total_updated += await _migrate_collection(
                db[collection_name], collection_name, name_to_keyword
            )

        # \u2713 is the check mark; written as an escape so the source file
        # stays ASCII and the symbol cannot be garbled by a re-encoding.
        print("\n\u2713 Migration completed!")
        print(f"\u2713 Total articles updated across all languages: {total_updated}")
    finally:
        # Close connection
        client.close()


if __name__ == "__main__":
    asyncio.run(migrate_article_source_keywords())