feat: SAPIENS Mobile App - Initial commit
React Native mobile application for SAPIENS news platform. Consolidated all previous history into single commit. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
116
scripts/extract-outlets.py
Executable file
116
scripts/extract-outlets.py
Executable file
@ -0,0 +1,116 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Extract outlets from news-api MongoDB collections
|
||||
"""
|
||||
import os
|
||||
from pymongo import MongoClient
|
||||
from collections import Counter
|
||||
import json
|
||||
|
||||
# MongoDB connection string; the MONGODB_URL environment variable overrides
# the localhost default.
MONGODB_URL = os.environ.get('MONGODB_URL', 'mongodb://localhost:27017')

# Name of the database that holds the article collections read below.
DB_NAME = 'ai_writer_db'
|
||||
|
||||
# Default destination kept from the original script. It is machine-specific,
# so pass ``output_file`` to write anywhere else.
_DEFAULT_OUTPUT_FILE = '/Users/jungwoochoi/Desktop/prototype/site11/services/sapiens-mobile/outlets-extracted.json'


def _unique_names(*sources):
    """Return the non-empty string items of all *sources*, de-duplicated in order.

    Sources that are missing (``None``) or not a list/tuple are ignored, so a
    malformed document cannot crash the extraction.  De-duplicating per article
    keeps one article from being counted twice for the same outlet.
    """
    names = {}
    for source in sources:
        if not isinstance(source, (list, tuple)):
            continue
        for value in source:
            if isinstance(value, str) and value:
                names.setdefault(value, None)
    return list(names)


def _record_outlet(outlet_map, name, category, description, article_id, image,
                   slug_slashes=False):
    """Attach one article to the outlet *name*, creating the outlet if needed."""
    outlet = outlet_map.get(name)
    if outlet is None:
        outlet_id = name.lower().replace(' ', '-')
        if slug_slashes:
            # Topic ids additionally map '/' to '-', as in the original script.
            outlet_id = outlet_id.replace('/', '-')
        outlet = outlet_map[name] = {
            'id': outlet_id,
            'name': name,
            'category': category,
            'articles': [],
            'image': None,
            'description': description,
        }
    outlet['articles'].append(article_id)
    # The first article that carries an image supplies the outlet's image.
    if not outlet['image'] and image:
        outlet['image'] = image


def _rank_outlets(outlet_map, min_articles):
    """Return outlets with at least *min_articles* articles, most-covered first."""
    kept = [o for o in outlet_map.values() if len(o['articles']) >= min_articles]
    kept.sort(key=lambda o: len(o['articles']), reverse=True)
    return kept


def extract_outlets(output_file=None, limit=1000, min_articles=3, top_n=50):
    """Build people/topic/company outlets from ``articles_en`` and save them as JSON.

    Reads up to *limit* documents from the ``articles_en`` collection of
    ``DB_NAME``, groups article ids by category (topics), by
    ``entities['people']`` (people) and by ``entities['organizations']`` plus
    ``entities['groups']`` (companies), keeps outlets that have at least
    *min_articles* articles, sorts them by article count (descending) and
    writes the first *top_n* of each kind to *output_file*.

    Args:
        output_file: Destination path of the JSON file.  ``None`` (default)
            uses the path hard-coded in the original script.
        limit: Value passed to the cursor's ``limit()`` (default 1000).
        min_articles: Minimum number of articles an outlet needs to be kept.
        top_n: Maximum number of outlets written per kind.

    Returns:
        None.  The result is written to *output_file* and a summary is printed.
    """
    if output_file is None:
        output_file = _DEFAULT_OUTPUT_FILE

    # Fetch only the fields that are actually used (``_id`` is included by
    # default), and always release the connection, even if the query fails.
    client = MongoClient(MONGODB_URL)
    try:
        collection = client[DB_NAME]['articles_en']  # English collection
        cursor = collection.find({}, {
            'categories': 1,
            'entities': 1,
            'images': 1,
        }).limit(limit)
        articles = list(cursor)
    finally:
        client.close()

    people_map = {}     # name -> outlet dict
    topics_map = {}     # topic -> outlet dict
    companies_map = {}  # company -> outlet dict

    for article in articles:
        article_id = str(article['_id'])
        entities = article.get('entities')
        if not isinstance(entities, dict):
            entities = {}  # field missing or stored as null
        images = article.get('images') or []
        image = images[0] if images else None

        # Topics come from the article's categories.
        for cat in _unique_names(article.get('categories')):
            _record_outlet(
                topics_map, cat, 'topics',
                f'News and articles about {cat}',
                article_id, image, slug_slashes=True,
            )

        # People come from the extracted entities.
        for person in _unique_names(entities.get('people')):
            _record_outlet(
                people_map, person, 'people',
                f'News and updates about {person}',
                article_id, image,
            )

        # Companies combine organizations and groups.
        for org in _unique_names(entities.get('organizations'),
                                 entities.get('groups')):
            _record_outlet(
                companies_map, org, 'companies',
                f'Business news about {org}',
                article_id, image,
            )

    people = _rank_outlets(people_map, min_articles)
    topics = _rank_outlets(topics_map, min_articles)
    companies = _rank_outlets(companies_map, min_articles)

    result = {
        'people': people[:top_n],
        'topics': topics[:top_n],
        'companies': companies[:top_n],
    }

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(result, f, indent=2, ensure_ascii=False)

    # Counts reported here are the filtered totals, before the top-N cut.
    print(f"Extracted {len(people)} people, {len(topics)} topics, {len(companies)} companies")
    print(f"Saved to {output_file}")
|
||||
|
||||
# Script entry point: run the extraction only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    extract_outlets()
|
||||
Reference in New Issue
Block a user