React Native mobile application for SAPIENS news platform. Consolidated all previous history into single commit. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
117 lines
4.2 KiB
Python
Executable File
117 lines
4.2 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
Extract outlets from news-api MongoDB collections
|
|
"""
|
|
import os
|
|
from pymongo import MongoClient
|
|
from collections import Counter
|
|
import json
|
|
|
|
# MongoDB connection string: read from the MONGODB_URL environment variable,
# falling back to a server on localhost when the variable is not set.
MONGODB_URL = os.environ.get('MONGODB_URL', 'mongodb://localhost:27017')
|
|
# Name of the MongoDB database that extract_outlets() reads articles from.
DB_NAME = 'ai_writer_db'
|
|
|
|
def _record_outlet(outlet_map, name, category, outlet_id, description, article_id, image):
    """Attach one article to the outlet *name*, creating its entry on first sight."""
    entry = outlet_map.get(name)
    if entry is None:
        # Key order is kept stable because it determines the JSON layout.
        entry = outlet_map[name] = {
            'id': outlet_id,
            'name': name,
            'category': category,
            'articles': [],
            'image': None,
            'description': description,
        }
    entry['articles'].append(article_id)
    # The first article that carries an image supplies the outlet's image.
    if not entry['image'] and image:
        entry['image'] = image


def _rank_outlets(outlet_map, min_articles):
    """Return entries with at least *min_articles* articles, most-covered first."""
    ranked = [entry for entry in outlet_map.values()
              if len(entry['articles']) >= min_articles]
    # list.sort is stable, so ties keep first-seen order.
    ranked.sort(key=lambda entry: len(entry['articles']), reverse=True)
    return ranked


def extract_outlets(output_file=None, *, limit=1000, min_articles=3, top_n=50):
    """Build people / topic / company "outlets" from articles and save them as JSON.

    Reads up to *limit* documents from the ``articles_en`` collection of the
    ``DB_NAME`` database, groups article ids by category (topics), by
    ``entities.people`` (people) and by ``entities.organizations`` plus
    ``entities.groups`` (companies), keeps the groups with at least
    *min_articles* articles, and writes the *top_n* largest of each kind to
    *output_file*.

    Args:
        output_file: Path of the JSON file to write. ``None`` (the default)
            keeps the path this script has always written to.
        limit: Maximum number of articles to fetch; passed to the cursor's
            ``limit()``.
        min_articles: Minimum number of articles an outlet needs to be kept.
        top_n: Number of outlets of each kind written to the file.

    Returns:
        None. The result is written to *output_file* and a summary is printed.

    Errors raised by the MongoDB driver or while writing the file propagate
    to the caller; the MongoDB client is closed in either case.
    """
    if output_file is None:
        output_file = '/Users/jungwoochoi/Desktop/prototype/site11/services/sapiens-mobile/outlets-extracted.json'

    client = MongoClient(MONGODB_URL)
    try:
        # Use English collection
        collection = client[DB_NAME]['articles_en']
        # Materialise the cursor so the connection can be released before
        # any processing or file I/O happens.
        articles = list(collection.find({}, {
            'source_keyword': 1,
            'categories': 1,
            'entities': 1,
            'images': 1,
            'summary': 1,
            'title': 1
        }).limit(limit))
    finally:
        # Previously the client leaked whenever anything above or below raised.
        client.close()

    people_map = {}     # name -> outlet entry
    topics_map = {}     # topic -> outlet entry
    companies_map = {}  # company -> outlet entry

    for article in articles:
        article_id = str(article['_id'])
        # "or" fallbacks also cover fields that are present but null, which
        # dict.get's default argument does not.
        entities = article.get('entities') or {}
        images = article.get('images') or []
        image = images[0] if images else None

        # Extract topics from categories
        for cat in article.get('categories') or []:
            if not cat:
                # Skip empty names, consistent with people and companies.
                continue
            _record_outlet(
                topics_map, cat, 'topics',
                cat.lower().replace(' ', '-').replace('/', '-'),
                f'News and articles about {cat}',
                article_id, image,
            )

        # Extract people from entities
        for person in entities.get('people') or []:
            if not person:
                continue
            _record_outlet(
                people_map, person, 'people',
                person.lower().replace(' ', '-'),
                f'News and updates about {person}',
                article_id, image,
            )

        # Extract companies from entities
        orgs = list(entities.get('organizations') or []) + list(entities.get('groups') or [])
        for org in orgs:
            if not org:
                continue
            _record_outlet(
                companies_map, org, 'companies',
                org.lower().replace(' ', '-'),
                f'Business news about {org}',
                article_id, image,
            )

    # Filter by minimum article count and sort by article count
    people = _rank_outlets(people_map, min_articles)
    topics = _rank_outlets(topics_map, min_articles)
    companies = _rank_outlets(companies_map, min_articles)

    result = {
        'people': people[:top_n],
        'topics': topics[:top_n],
        'companies': companies[:top_n]
    }

    # Save to file
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(result, f, indent=2, ensure_ascii=False)

    # The counts are the numbers that passed the filter, before the top_n cut.
    print(f"Extracted {len(people)} people, {len(topics)} topics, {len(companies)} companies")
    print(f"Saved to {output_file}")
|
|
|
|
# Script entry point: run the extraction only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    extract_outlets()
|