#!/usr/bin/env python3
"""
Extract outlets from news-api MongoDB collections
"""

import json
import os
from collections import Counter

from pymongo import MongoClient

MONGODB_URL = os.getenv('MONGODB_URL', 'mongodb://localhost:27017')
DB_NAME = 'ai_writer_db'

# Default destination of the generated JSON; override with the
# OUTLETS_OUTPUT_FILE environment variable or the `output_file` argument.
DEFAULT_OUTPUT_FILE = '/Users/jungwoochoi/Desktop/prototype/site11/services/sapiens-mobile/outlets-extracted.json'

ARTICLE_LIMIT = 1000  # maximum number of articles read from the collection
MIN_ARTICLES = 3      # an outlet needs at least this many articles to be kept
TOP_N = 50            # number of outlets written per category


def _unique_names(values):
    """Return the non-empty string entries of *values*, de-duplicated in order.

    ``None`` (a missing or null field) is treated as an empty list.
    Non-string and empty entries are dropped because they cannot be turned
    into an outlet id.
    """
    return list(dict.fromkeys(
        value for value in (values or []) if value and isinstance(value, str)
    ))


def _record_outlet(outlets, name, category, description, article_id, image,
                   extra_separators=''):
    """Register *article_id* under the outlet *name*, creating it if needed.

    Args:
        outlets: dict mapping outlet name -> outlet record; mutated in place.
        name: display name of the outlet (also the dict key).
        category: value stored in the record's ``category`` field.
        description: description stored when the outlet is first created.
        article_id: stringified ``_id`` of the article mentioning the outlet.
        image: first image of the article, or ``None``.
        extra_separators: characters besides the space that are replaced
            with ``-`` when building the outlet id.
    """
    outlet = outlets.get(name)
    if outlet is None:
        slug = name.lower().replace(' ', '-')
        for separator in extra_separators:
            slug = slug.replace(separator, '-')
        outlet = outlets[name] = {
            'id': slug,
            'name': name,
            'category': category,
            'articles': [],
            'image': None,
            'description': description,
        }

    outlet['articles'].append(article_id)
    # The first article that carries an image provides the outlet's image.
    if not outlet['image'] and image:
        outlet['image'] = image


def _collect_outlets(articles):
    """Group *articles* into people, topic and company outlet maps.

    Returns:
        A ``(people_map, topics_map, companies_map)`` tuple of dicts keyed by
        outlet name.
    """
    people_map = {}
    topics_map = {}
    companies_map = {}

    for article in articles:
        article_id = str(article['_id'])
        entities = article.get('entities') or {}
        images = article.get('images') or []
        image = images[0] if images else None

        # Topics come from the article categories; '/' is also slugged to '-'.
        for cat in _unique_names(article.get('categories')):
            _record_outlet(
                topics_map, cat, 'topics',
                f'News and articles about {cat}',
                article_id, image, extra_separators='/',
            )

        for person in _unique_names(entities.get('people')):
            _record_outlet(
                people_map, person, 'people',
                f'News and updates about {person}',
                article_id, image,
            )

        # Organizations and groups are merged into one "companies" category.
        # De-duplicating the merged list keeps an article from being counted
        # twice when the same name appears in both fields.
        orgs = (list(entities.get('organizations') or [])
                + list(entities.get('groups') or []))
        for org in _unique_names(orgs):
            _record_outlet(
                companies_map, org, 'companies',
                f'Business news about {org}',
                article_id, image,
            )

    return people_map, topics_map, companies_map


def _rank_outlets(outlets):
    """Return outlets with at least MIN_ARTICLES articles, most-covered first."""
    kept = [o for o in outlets.values() if len(o['articles']) >= MIN_ARTICLES]
    # sorted() is stable, so ties keep their first-seen order.
    return sorted(kept, key=lambda o: len(o['articles']), reverse=True)


def extract_outlets(output_file=None):
    """Build the people/topics/companies outlet lists and save them as JSON.

    Reads up to ARTICLE_LIMIT documents from the ``articles_en`` collection
    of ``DB_NAME``, groups them by category, person and organization/group,
    keeps outlets with at least MIN_ARTICLES articles, and writes the TOP_N
    most-covered outlets of each kind to *output_file*.

    Args:
        output_file: path of the JSON file to write. Defaults to the
            ``OUTLETS_OUTPUT_FILE`` environment variable, falling back to
            ``DEFAULT_OUTPUT_FILE``.

    Returns:
        The dict that was written, with keys ``people``, ``topics`` and
        ``companies``.
    """
    if output_file is None:
        output_file = os.getenv('OUTLETS_OUTPUT_FILE', DEFAULT_OUTPUT_FILE)

    client = MongoClient(MONGODB_URL)
    try:
        # Use English collection
        collection = client[DB_NAME]['articles_en']

        # Fetch at most ARTICLE_LIMIT articles, projecting only the fields
        # that are actually read below (``_id`` is always included).
        articles = list(collection.find({}, {
            'categories': 1,
            'entities': 1,
            'images': 1,
        }).limit(ARTICLE_LIMIT))
    finally:
        # Release the connection even if the query fails.
        client.close()

    people_map, topics_map, companies_map = _collect_outlets(articles)

    people = _rank_outlets(people_map)
    topics = _rank_outlets(topics_map)
    companies = _rank_outlets(companies_map)

    result = {
        'people': people[:TOP_N],
        'topics': topics[:TOP_N],
        'companies': companies[:TOP_N],
    }

    # Save to file
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(result, f, indent=2, ensure_ascii=False)

    print(f"Extracted {len(people)} people, {len(topics)} topics, {len(companies)} companies")
    print(f"Saved to {output_file}")

    return result


if __name__ == '__main__':
    extract_outlets()