sapiens-mobile/scripts/extract-outlets.py

#!/usr/bin/env python3
"""
Extract outlets from news-api MongoDB collections
"""
import os
from pymongo import MongoClient
from collections import Counter
import json

# MongoDB connection string; taken from the MONGODB_URL environment variable
# when set, otherwise a local instance on the default port is used.
MONGODB_URL = os.getenv('MONGODB_URL', 'mongodb://localhost:27017')
# Name of the database whose article collection is read by the extractor.
DB_NAME = 'ai_writer_db'

# Tunables for one extraction run.
_ARTICLE_LIMIT = 1000  # maximum number of articles scanned
_MIN_ARTICLES = 3      # outlets referenced by fewer articles are dropped
_TOP_N = 50            # outlets kept per category after ranking

# Location the script has always written to. It is machine-specific, so it
# is only the fallback used when the caller does not pass ``output_file``.
_DEFAULT_OUTPUT_FILE = (
    '/Users/jungwoochoi/Desktop/prototype/site11/services/'
    'sapiens-mobile/outlets-extracted.json'
)


def _as_list(value):
    """Return *value* as a list, or an empty list if it is not a list/tuple."""
    return list(value) if isinstance(value, (list, tuple)) else []


def _record(outlets, names, article_id, image, category, description,
            slash_to_dash=False):
    """Register *article_id* under every distinct, non-empty name in *names*."""
    valid = [name for name in names if isinstance(name, str) and name]
    # dict.fromkeys de-duplicates while keeping first-seen order, so a name
    # listed twice for one article is only counted once.
    for name in dict.fromkeys(valid):
        entry = outlets.get(name)
        if entry is None:
            outlet_id = name.lower().replace(' ', '-')
            if slash_to_dash:
                outlet_id = outlet_id.replace('/', '-')
            entry = outlets[name] = {
                'id': outlet_id,
                'name': name,
                'category': category,
                'articles': [],
                'image': None,
                'description': description.format(name),
            }
        entry['articles'].append(article_id)
        # The first article that carries an image supplies the outlet image.
        if not entry['image'] and image:
            entry['image'] = image


def _rank(outlets):
    """Return outlets with enough articles, most-referenced first."""
    kept = [o for o in outlets.values() if len(o['articles']) >= _MIN_ARTICLES]
    kept.sort(key=lambda o: len(o['articles']), reverse=True)
    return kept


def extract_outlets(output_file=None, collection=None):
    """Aggregate people, topic and company outlets from articles into JSON.

    Scans up to ``_ARTICLE_LIMIT`` articles, groups their IDs by category
    name, person name and organization/group name, keeps outlets referenced
    by at least ``_MIN_ARTICLES`` distinct articles, ranks them by article
    count and writes the top ``_TOP_N`` of each kind to *output_file*.

    Args:
        output_file: Path of the JSON file to write. Defaults to the
            script's historical output location.
        collection: Optional collection-like object exposing
            ``find(filter, projection)`` whose result supports ``limit(n)``.
            When omitted, a connection is opened with ``MONGODB_URL`` and
            the ``articles_en`` collection of ``DB_NAME`` is read; that
            connection is always closed before returning.

    Returns:
        The dict written to disk, with keys ``'people'``, ``'topics'`` and
        ``'companies'``.
    """
    if output_file is None:
        output_file = _DEFAULT_OUTPUT_FILE

    client = None
    if collection is None:
        client = MongoClient(MONGODB_URL)
        # Use English collection
        collection = client[DB_NAME]['articles_en']

    people_map = {}     # name -> outlet record
    topics_map = {}     # topic -> outlet record
    companies_map = {}  # company -> outlet record

    try:
        # Only the fields actually consumed below are projected.
        cursor = collection.find({}, {
            'categories': 1,
            'entities': 1,
            'images': 1,
        }).limit(_ARTICLE_LIMIT)

        for article in cursor:
            article_id = str(article['_id'])

            # Fields may be missing or stored as null; treat both as empty.
            entities = article.get('entities')
            if not isinstance(entities, dict):
                entities = {}
            images = _as_list(article.get('images'))
            image = images[0] if images else None

            _record(topics_map, _as_list(article.get('categories')),
                    article_id, image, 'topics',
                    'News and articles about {}', slash_to_dash=True)
            _record(people_map, _as_list(entities.get('people')),
                    article_id, image, 'people',
                    'News and updates about {}')
            # Organizations and groups are both reported as companies.
            orgs = (_as_list(entities.get('organizations'))
                    + _as_list(entities.get('groups')))
            _record(companies_map, orgs, article_id, image, 'companies',
                    'Business news about {}')
    finally:
        # Close the connection even if reading or processing fails, but
        # never close a collection that belongs to the caller.
        if client is not None:
            client.close()

    people = _rank(people_map)
    topics = _rank(topics_map)
    companies = _rank(companies_map)

    result = {
        'people': people[:_TOP_N],
        'topics': topics[:_TOP_N],
        'companies': companies[:_TOP_N],
    }

    # Save to file
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(result, f, indent=2, ensure_ascii=False)

    print(f"Extracted {len(people)} people, {len(topics)} topics, {len(companies)} companies")
    print(f"Saved to {output_file}")

    return result

# Run the extraction only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == '__main__':
    extract_outlets()