feat: SAPIENS Mobile App - Initial commit

React Native mobile application for SAPIENS news platform.
Consolidated all previous history into single commit.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
jungwoo choi
2025-10-23 14:30:25 +09:00
commit 919afe56f2
1516 changed files with 64072 additions and 0 deletions

92
server/index.ts Normal file
View File

@ -0,0 +1,92 @@
import express, { type Request, Response, NextFunction } from "express";
import { registerRoutes } from "./routes";
import { setupVite, serveStatic, log } from "./vite";
const app = express();

// CORS middleware for the Expo app: the same permissive headers are attached
// to every response, and preflight requests are answered right away.
const corsHeaders: Array<[string, string]> = [
  ['Access-Control-Allow-Origin', '*'],
  ['Access-Control-Allow-Methods', 'GET, POST, PUT, DELETE, OPTIONS, PATCH'],
  ['Access-Control-Allow-Headers', 'Content-Type, Authorization'],
];
app.use((req, res, next) => {
  for (const [field, value] of corsHeaders) {
    res.header(field, value);
  }
  // Preflight requests stop here with an empty 200 response.
  if (req.method === 'OPTIONS') {
    return res.sendStatus(200);
  }
  next();
});

// Body parsers for JSON and URL-encoded form payloads.
app.use(express.json());
app.use(express.urlencoded({ extended: false }));
// Request logger: once a response has finished, /api requests are reported as
// "<METHOD> <path> <status> in <n>ms", followed by the JSON body that was sent
// (if any). The whole line is capped at 80 characters.
app.use((req, res, next) => {
  const startedAt = Date.now();
  const requestPath = req.path;
  let jsonPayload: Record<string, any> | undefined = undefined;

  // Wrap res.json so the body handed to it can be echoed in the log line.
  const sendJson = res.json;
  res.json = function (bodyJson, ...args) {
    jsonPayload = bodyJson;
    return sendJson.apply(res, [bodyJson, ...args]);
  };

  res.on("finish", () => {
    const elapsedMs = Date.now() - startedAt;
    if (!requestPath.startsWith("/api")) {
      return;
    }
    const payloadSuffix = jsonPayload ? ` :: ${JSON.stringify(jsonPayload)}` : "";
    const fullLine = `${req.method} ${requestPath} ${res.statusCode} in ${elapsedMs}ms${payloadSuffix}`;
    log(fullLine.length > 80 ? fullLine.slice(0, 79) + "…" : fullLine);
  });

  next();
});
// Server bootstrap: registers the API routes, installs the error handler,
// wires up either the Vite dev middleware or static file serving, and starts
// listening.
(async () => {
  const server = await registerRoutes(app);

  // Central error handler. The error is logged here instead of being rethrown:
  // throwing after the response has been written hands the error to the next
  // error handler for a request that has already been answered.
  app.use((err: any, _req: Request, res: Response, next: NextFunction) => {
    const status = err.status || err.statusCode || 500;
    const message = err.message || "Internal Server Error";
    console.error(err);
    if (res.headersSent) {
      // A response is already in flight; let Express finish/abort it.
      return next(err);
    }
    res.status(status).json({ message });
  });

  // importantly only setup vite in development and after
  // setting up all the other routes so the catch-all route
  // doesn't interfere with the other routes
  const isProduction = app.get("env") === "production" || process.env.REPLIT_DEPLOYMENT === "1";
  if (!isProduction) {
    await setupVite(app, server);
  } else {
    log("Setting up static file serving for production");
    // Protect unmatched API routes from static file serving catch-all
    // This should come after all API routes are registered
    app.use('/api/*', (req, res) => {
      // req.path is relative to the mount point inside app.use(), so the
      // requested path has to be taken from originalUrl (query string removed).
      const requestedPath = req.originalUrl.split('?')[0];
      res.status(404).json({ error: `API endpoint not found: ${requestedPath}` });
    });
    serveStatic(app);
  }

  // ALWAYS serve the app on the port specified in the environment variable PORT
  // Other ports are firewalled. Default to 5000 if not specified.
  // this serves both the API and the client.
  // It is the only port that is not firewalled.
  const port = parseInt(process.env.PORT || '5000', 10);
  server.listen(port, "0.0.0.0", () => {
    log(`serving on port ${port}`);
  });
})();

532
server/newsapi-client.ts Normal file
View File

@ -0,0 +1,532 @@
/**
* News-API Client
*
* Handles communication with news-api MongoDB and data transformation
* to sapiens-mobile schema format.
*/
import { MongoClient, Db, ObjectId } from 'mongodb';
import fs from 'fs';
import path from 'path';
/** Shape of the outlets cache parsed from outlets-extracted.json. */
type OutletsByCategory = {
  people: any[];
  topics: any[];
  companies: any[];
};

// Connection settings: the URL can be overridden through the environment,
// the database name is fixed.
const DB_NAME = 'ai_writer_db';
const MONGODB_URL = process.env.MONGODB_URL || 'mongodb://localhost:27017';

// Outlets cache, filled on first use from outlets-extracted.json.
let outlets: OutletsByCategory | null = null;

// MongoDB handles, created on first connection.
let client: MongoClient | null = null;
let db: Db | null = null;
/**
 * Initialize the MongoDB connection (no-op when already connected).
 *
 * The module-level `client` / `db` handles are only assigned after
 * `connect()` has succeeded. A failed attempt therefore leaves the module
 * unconnected, and the next call retries instead of returning early with a
 * null `db`.
 *
 * @throws the underlying driver error when the connection cannot be made.
 */
export async function connectToNewsAPI(): Promise<void> {
  if (client && db) return; // Already connected

  let candidate: MongoClient | null = null;
  try {
    candidate = new MongoClient(MONGODB_URL);
    await candidate.connect();

    if (client && db) {
      // Another caller finished connecting while this one was waiting;
      // keep the existing connection and discard the extra client.
      await candidate.close();
      return;
    }

    const database = candidate.db(DB_NAME);
    client = candidate;
    db = database;
    console.log(`Connected to news-api MongoDB: ${DB_NAME}`);
  } catch (error) {
    console.error('Failed to connect to news-api MongoDB:', error);
    if (candidate && candidate !== client) {
      // Best-effort cleanup of the half-open client; the original error wins.
      await candidate.close().catch(() => undefined);
    }
    throw error;
  }
}
/**
 * Load outlets from outlets-extracted.json (resolved against the current
 * working directory) and cache them for subsequent calls.
 *
 * The parsed value is validated before it is cached, so a malformed file
 * cannot leave a broken object in the cache.
 *
 * @returns the cached object with `people`, `topics` and `companies` arrays.
 * @throws when the file cannot be read, is not valid JSON, or lacks one of
 *   the three arrays.
 */
export function loadOutlets(): any {
  if (outlets) return outlets;
  try {
    const outletsPath = path.resolve(process.cwd(), 'outlets-extracted.json');
    const parsed: unknown = JSON.parse(fs.readFileSync(outletsPath, 'utf-8'));

    const hasAllCategories =
      typeof parsed === 'object' &&
      parsed !== null &&
      ['people', 'topics', 'companies'].every(key =>
        Array.isArray((parsed as Record<string, unknown>)[key])
      );
    if (!hasAllCategories) {
      throw new Error('outlets-extracted.json must contain "people", "topics" and "companies" arrays');
    }

    const loaded = parsed as { people: any[]; topics: any[]; companies: any[] };
    outlets = loaded;
    console.log(`Loaded outlets: ${loaded.people.length} people, ${loaded.topics.length} topics, ${loaded.companies.length} companies`);
    return outlets;
  } catch (error) {
    console.error('Failed to load outlets from outlets-extracted.json:', error);
    throw error;
  }
}
// Multi-language translations for outlet names.
// Outer key: the outlet ID (translateOutlet looks entries up by `outlet.id`).
// Inner key: the language code (en, ja, zh_cn, zh_tw, de, fr, es, it).
// Korean ('ko') has no entries because translateOutlet returns the outlet
// unchanged for that language.
const OUTLET_TRANSLATIONS: Record<string, Record<string, string>> = {
'도널드-트럼프': {
en: 'Donald Trump',
ja: 'ドナルド・トランプ',
zh_cn: '唐纳德·特朗普',
zh_tw: '唐納德·川普',
de: 'Donald Trump',
fr: 'Donald Trump',
es: 'Donald Trump',
it: 'Donald Trump'
},
'온유': {
en: 'Onew',
ja: 'オンユ',
zh_cn: '温流',
zh_tw: '溫流',
de: 'Onew',
fr: 'Onew',
es: 'Onew',
it: 'Onew'
},
'사토시-나카모토': {
en: 'Satoshi Nakamoto',
ja: 'サトシ・ナカモト',
zh_cn: '中本聪',
zh_tw: '中本聰',
de: 'Satoshi Nakamoto',
fr: 'Satoshi Nakamoto',
es: 'Satoshi Nakamoto',
it: 'Satoshi Nakamoto'
},
'일론-머스크': {
en: 'Elon Musk',
ja: 'イーロン・マスク',
zh_cn: '埃隆·马斯克',
zh_tw: '伊隆·馬斯克',
de: 'Elon Musk',
fr: 'Elon Musk',
es: 'Elon Musk',
it: 'Elon Musk'
},
'매기-강': {
en: 'Maggie Kang',
ja: 'マギー・カン',
zh_cn: '玛吉·姜',
zh_tw: '瑪姬·姜',
de: 'Maggie Kang',
fr: 'Maggie Kang',
es: 'Maggie Kang',
it: 'Maggie Kang'
},
'제롬-파월': {
en: 'Jerome Powell',
ja: 'ジェローム・パウエル',
zh_cn: '杰罗姆·鲍威尔',
zh_tw: '傑羅姆·鮑威爾',
de: 'Jerome Powell',
fr: 'Jerome Powell',
es: 'Jerome Powell',
it: 'Jerome Powell'
},
'블라디미르-푸틴': {
en: 'Vladimir Putin',
ja: 'ウラジーミル・プーチン',
zh_cn: '弗拉基米尔·普京',
zh_tw: '弗拉基米爾·普丁',
de: 'Wladimir Putin',
fr: 'Vladimir Poutine',
es: 'Vladímir Putin',
it: 'Vladimir Putin'
},
'조-바이든': {
en: 'Joe Biden',
ja: 'ジョー・バイデン',
zh_cn: '乔·拜登',
zh_tw: '喬·拜登',
de: 'Joe Biden',
fr: 'Joe Biden',
es: 'Joe Biden',
it: 'Joe Biden'
},
'블랙핑크': {
en: 'BLACKPINK',
ja: 'ブラックピンク',
zh_cn: 'BLACKPINK',
zh_tw: 'BLACKPINK',
de: 'BLACKPINK',
fr: 'BLACKPINK',
es: 'BLACKPINK',
it: 'BLACKPINK'
},
'구글': {
en: 'Google',
ja: 'グーグル',
zh_cn: '谷歌',
zh_tw: '谷歌',
de: 'Google',
fr: 'Google',
es: 'Google',
it: 'Google'
},
'마이크로소프트': {
en: 'Microsoft',
ja: 'マイクロソフト',
zh_cn: '微软',
zh_tw: '微軟',
de: 'Microsoft',
fr: 'Microsoft',
es: 'Microsoft',
it: 'Microsoft'
},
'넷플릭스': {
en: 'Netflix',
ja: 'ネットフリックス',
zh_cn: '奈飞',
zh_tw: 'Netflix',
de: 'Netflix',
fr: 'Netflix',
es: 'Netflix',
it: 'Netflix'
},
'메타': {
en: 'Meta',
ja: 'メタ',
zh_cn: 'Meta',
zh_tw: 'Meta',
de: 'Meta',
fr: 'Meta',
es: 'Meta',
it: 'Meta'
},
'삼성전자': {
en: 'Samsung Electronics',
ja: 'サムスン電子',
zh_cn: '三星电子',
zh_tw: '三星電子',
de: 'Samsung Electronics',
fr: 'Samsung Electronics',
es: 'Samsung Electronics',
it: 'Samsung Electronics'
},
'아마존': {
en: 'Amazon',
ja: 'アマゾン',
zh_cn: '亚马逊',
zh_tw: '亞馬遜',
de: 'Amazon',
fr: 'Amazon',
es: 'Amazon',
it: 'Amazon'
},
'샤이니': {
en: 'SHINee',
ja: 'シャイニー',
zh_cn: 'SHINee',
zh_tw: 'SHINee',
de: 'SHINee',
fr: 'SHINee',
es: 'SHINee',
it: 'SHINee'
}
};
/**
 * Translate outlet name and description based on language.
 *
 * Korean ('ko') returns the outlet object untouched. For any other language
 * the name comes from OUTLET_TRANSLATIONS when an entry exists for this
 * outlet ID and language; otherwise the original name is kept. The
 * description is rebuilt from a per-language template, falling back to the
 * outlet's own description for languages without a template.
 *
 * Lookups are restricted to own properties so that values such as
 * "constructor" or "toString" (the language may come from a request) never
 * resolve to Object.prototype members.
 *
 * @param outlet   outlet record; `id`, `name` and `description` are read.
 * @param language language code such as 'en', 'ja' or 'zh_cn'.
 * @returns the original outlet for 'ko', otherwise a shallow copy with
 *   `name` and `description` replaced.
 */
function translateOutlet(outlet: any, language: string): any {
  // If Korean, return original
  if (language === 'ko') {
    return outlet;
  }

  const hasOwn = (table: object, key: PropertyKey): boolean =>
    Object.prototype.hasOwnProperty.call(table, key);

  // Use the translated name when one exists; otherwise keep the original
  // (possibly Korean) name as is.
  let displayName = outlet.name;
  const namesByLanguage = hasOwn(OUTLET_TRANSLATIONS, outlet.id)
    ? OUTLET_TRANSLATIONS[outlet.id]
    : undefined;
  if (namesByLanguage && hasOwn(namesByLanguage, language) && namesByLanguage[language]) {
    displayName = namesByLanguage[language];
  }

  // Translate description pattern
  const descriptionTranslations: Record<string, string> = {
    'en': `News and updates about ${displayName}`,
    'ja': `${displayName}に関するニュースと最新情報`,
    'zh_cn': `关于${displayName}的新闻和更新`,
    'zh_tw': `關於${displayName}的新聞和更新`,
    'de': `Nachrichten und Updates über ${displayName}`,
    'fr': `Actualités et mises à jour sur ${displayName}`,
    'es': `Noticias y actualizaciones sobre ${displayName}`,
    'it': `Notizie e aggiornamenti su ${displayName}`
  };
  const description = hasOwn(descriptionTranslations, language)
    ? descriptionTranslations[language]
    : outlet.description;

  return {
    ...outlet,
    name: displayName,
    description
  };
}
/**
 * Return outlets, translated into `language` and decorated with
 * `focusSubject` (name, or ID when there is no name) and `avatar` (the
 * outlet image).
 *
 * Without a category, people, topics and companies are returned in that
 * order. An unrecognised category yields an empty list.
 */
export function getOutlets(category?: string, language = 'ko'): any[] {
  const byCategory = loadOutlets();

  const decorate = (entries: any[]) =>
    entries.map(entry => {
      const localized = translateOutlet(entry, language);
      return {
        ...localized,
        focusSubject: localized.name || localized.id,
        avatar: localized.image
      };
    });

  if (!category) {
    const people = decorate(byCategory.people);
    const topics = decorate(byCategory.topics);
    const companies = decorate(byCategory.companies);
    return people.concat(topics, companies);
  }

  if (category === 'people' || category === 'topics' || category === 'companies') {
    return decorate(byCategory[category]);
  }
  return [];
}
/**
 * Look up a single outlet by its ID across all categories.
 *
 * @returns the translated outlet with `focusSubject` and `avatar` filled in,
 *   or null when no outlet has that ID.
 */
export function getOutletById(id: string, language = 'ko'): any | null {
  for (const candidate of getOutlets(undefined, language)) {
    if (candidate.id !== id) continue;
    // Fill in focusSubject and avatar when they are still missing.
    return {
      ...candidate,
      focusSubject: candidate.focusSubject || candidate.name || candidate.id,
      avatar: candidate.avatar || candidate.image
    };
  }
  return null;
}
/**
 * Get articles for an outlet, in the order listed in `outlet.articles`.
 *
 * The outlet stores English article IDs. Those are resolved to `news_id`
 * values through `articles_en`, and the matching documents are then read
 * from `articles_<language>`.
 *
 * @param outletId outlet ID as used by getOutletById.
 * @param limit    maximum number of outlet article IDs to consider.
 * @param language language suffix of the target collection.
 * @returns transformed articles; empty when the outlet is unknown, has no
 *   usable article IDs, or nothing matches.
 * @throws Error('Database not connected') when no database handle exists.
 */
export async function getArticlesByOutlet(outletId: string, limit = 50, language = 'en'): Promise<any[]> {
  await connectToNewsAPI();
  if (!db) throw new Error('Database not connected');

  const outlet = getOutletById(outletId);
  if (!outlet || !Array.isArray(outlet.articles)) return [];

  // Skip malformed IDs instead of letting `new ObjectId` throw, and drop
  // duplicates so an article is never returned twice.
  const seen = new Set<string>();
  const orderedIds: ObjectId[] = [];
  for (const rawId of outlet.articles.slice(0, limit)) {
    if (typeof rawId !== 'string' || !ObjectId.isValid(rawId)) continue;
    const objectId = new ObjectId(rawId);
    const key = objectId.toString();
    if (seen.has(key)) continue;
    seen.add(key);
    orderedIds.push(objectId);
  }
  if (orderedIds.length === 0) return [];

  // First, get news_ids from English collection
  const enArticles = await db.collection('articles_en').find(
    { _id: { $in: orderedIds } },
    { projection: { news_id: 1, _id: 1 } }
  ).toArray();

  const newsIdByEnglishId = new Map<string, any>();
  for (const en of enArticles) {
    if (en.news_id) newsIdByEnglishId.set(en._id.toString(), en.news_id);
  }
  if (newsIdByEnglishId.size === 0) return [];

  // Then get articles from target language collection using news_ids
  const articles = await db.collection(`articles_${language}`).find({
    news_id: { $in: Array.from(newsIdByEnglishId.values()) }
  }).toArray();
  const articleByNewsId = new Map<any, any>();
  for (const article of articles) {
    articleByNewsId.set(article.news_id, article);
  }

  // `$in` does not preserve order, so walk the outlet's own ID list to keep
  // the articles in the same order as outlet.articles.
  const result: any[] = [];
  for (const objectId of orderedIds) {
    const englishId = objectId.toString();
    const article = articleByNewsId.get(newsIdByEnglishId.get(englishId));
    if (article) {
      result.push(transformArticle(article, englishId));
    }
  }
  return result;
}
/**
 * Fetch one article by its `news_id` from the collection of the requested
 * language (the preferred lookup for cross-language support).
 *
 * The English document's `_id` is looked up as well and handed to
 * transformArticle for the outlet lookup.
 *
 * @returns the transformed article, or null when the language collection has
 *   no document with that `news_id`.
 */
export async function getArticleByNewsId(newsId: string, language = 'en'): Promise<any | null> {
  await connectToNewsAPI();
  if (!db) throw new Error('Database not connected');
  console.log(`[newsapi-client.getArticleByNewsId] newsId=${newsId}, language=${language}`);

  const targetName = `articles_${language}`;
  const localized = await db.collection(targetName).findOne({ news_id: newsId });
  if (!localized) {
    console.log(`[newsapi-client.getArticleByNewsId] Article not found in ${targetName}`);
    return null;
  }

  // The outlet lookup works on English article IDs.
  const englishDoc = await db.collection('articles_en').findOne(
    { news_id: newsId },
    { projection: { _id: 1 } }
  );
  console.log(`[newsapi-client.getArticleByNewsId] Found article in ${targetName}: ${localized.title}`);
  return transformArticle(localized, englishDoc?._id.toString());
}
/**
 * Get article by ID (for backward compatibility).
 *
 * The ID is first tried against `articles_<language>`. If that misses and
 * the language is not English, the ID is treated as an English article ID:
 * its `news_id` is read from `articles_en` and used to find the translation.
 *
 * @param id       article `_id` as a string.
 * @param language language suffix of the target collection.
 * @returns the transformed article, or null when the ID is malformed or no
 *   matching article exists.
 * @throws Error('Database not connected') when no database handle exists.
 */
export async function getArticleById(id: string, language = 'en'): Promise<any | null> {
  await connectToNewsAPI();
  if (!db) throw new Error('Database not connected');
  console.log(`[newsapi-client.getArticleById] id=${id}, language=${language}`);

  // A malformed ID cannot match anything; report "not found" rather than
  // letting the ObjectId constructor throw.
  if (typeof id !== 'string' || !ObjectId.isValid(id)) {
    console.log(`[newsapi-client.getArticleById] Article not found in any collection`);
    return null;
  }
  const objectId = new ObjectId(id);

  // First, try to find the article directly in the requested language collection
  const collectionName = `articles_${language}`;
  const collection = db.collection(collectionName);
  const enCollection = db.collection('articles_en');

  const direct = await collection.findOne({ _id: objectId });
  if (direct) {
    console.log(`[newsapi-client.getArticleById] Found article directly in ${collectionName}: ${direct.title}`);
    // Outlets list English article IDs, so a localized document needs its
    // English counterpart's ID for the outlet lookup.
    let englishId = id;
    if (language !== 'en' && direct.news_id) {
      const enMatch = await enCollection.findOne(
        { news_id: direct.news_id },
        { projection: { _id: 1 } }
      );
      if (enMatch) englishId = enMatch._id.toString();
    }
    return transformArticle(direct, englishId);
  }

  // For English the lookup above already searched articles_en, so there is
  // nothing left to try.
  if (language === 'en') {
    console.log(`[newsapi-client.getArticleById] Article not found in any collection`);
    return null;
  }

  // The ID might be from the English collection: resolve its news_id.
  const enArticle = await enCollection.findOne(
    { _id: objectId },
    { projection: { news_id: 1 } }
  );
  console.log(`[newsapi-client.getArticleById] Checked English collection, news_id: ${enArticle?.news_id}`);
  if (!enArticle || !enArticle.news_id) {
    console.log(`[newsapi-client.getArticleById] Article not found in any collection`);
    return null;
  }

  // For other languages, get article using news_id
  console.log(`[newsapi-client.getArticleById] Querying ${collectionName} with news_id: ${enArticle.news_id}`);
  const translated = await collection.findOne({ news_id: enArticle.news_id });
  if (!translated) {
    console.log(`[newsapi-client.getArticleById] No article found in ${collectionName} with news_id ${enArticle.news_id}`);
    return null;
  }
  console.log(`[newsapi-client.getArticleById] Found article in ${collectionName}: ${translated.title}`);
  return transformArticle(translated, id); // Pass the English ID
}
/**
 * Search articles whose title, summary or body contains `query`
 * (case-insensitive) in the collection of the requested language.
 *
 * The query is matched literally: regular-expression metacharacters are
 * escaped so user input cannot inject patterns.
 *
 * @param query    text to look for.
 * @param limit    maximum number of results.
 * @param language language suffix of the target collection.
 * @returns transformed articles.
 * @throws Error('Database not connected') when no database handle exists.
 */
export async function searchArticles(query: string, limit = 20, language = 'en'): Promise<any[]> {
  await connectToNewsAPI();
  if (!db) throw new Error('Database not connected');

  const literalPattern = String(query).replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  const articles = await db.collection(`articles_${language}`).find({
    $or: [
      { title: { $regex: literalPattern, $options: 'i' } },
      { summary: { $regex: literalPattern, $options: 'i' } },
      { body: { $regex: literalPattern, $options: 'i' } }
    ]
  }).limit(limit).toArray();

  // Call transformArticle explicitly: passing it straight to map() would
  // feed the array index in as `englishArticleId` and break the outlet
  // lookup for every result after the first.
  if (language === 'en') {
    return articles.map(article => transformArticle(article));
  }

  // Outlets list English article IDs, so resolve them through news_id.
  const newsIds = articles.map((article: any) => article.news_id).filter(Boolean);
  const englishIdByNewsId = new Map<any, string>();
  if (newsIds.length > 0) {
    const enArticles = await db.collection('articles_en').find(
      { news_id: { $in: newsIds } },
      { projection: { news_id: 1, _id: 1 } }
    ).toArray();
    for (const en of enArticles) {
      englishIdByNewsId.set(en.news_id, en._id.toString());
    }
  }
  return articles.map(article => transformArticle(article, englishIdByNewsId.get(article.news_id)));
}
/**
 * Transform news-api article to sapiens-mobile format.
 *
 * @param article          raw document from an `articles_<language>` collection.
 * @param englishArticleId ID of the English version of the article. Outlets
 *   are matched by this ID when given, otherwise by the article's own `_id`.
 * @returns the article in the shape consumed by the mobile app. `outletId`,
 *   `outletName` and `category` fall back to 'unknown', 'Unknown' and
 *   'topics' when no outlet lists the article.
 */
function transformArticle(article: any, englishArticleId?: string): any {
  // Find which outlet this article belongs to
  const ownId = article._id.toString();
  const lookupId = englishArticleId || ownId;
  const outlet = getOutlets().find(
    o => Array.isArray(o.articles) && o.articles.includes(lookupId)
  );

  // Extract the first image or use default
  const images = Array.isArray(article.images) ? article.images : [];
  const thumbnail = images.length > 0 ? images[0] : '/api/assets/default-article.png';

  // Format time ago. The value is deliberately clamped to 1..59 minutes; an
  // unparsable date counts as the minimum instead of rendering "NaN min ago".
  const publishedAt = article.created_at || new Date();
  const elapsedMs = Date.now() - new Date(publishedAt).getTime();
  const diffInMinutes = Number.isFinite(elapsedMs) ? Math.floor(elapsedMs / 60000) : 1;
  const clampedMinutes = Math.max(1, Math.min(59, diffInMinutes));

  // Ensure tags is always an array of strings
  let tags: string[] = [];
  if (Array.isArray(article.subtopics)) {
    tags = article.subtopics
      .map((t: any) => {
        if (typeof t === 'string') return t;
        if (t && typeof t === 'object' && typeof t.title === 'string' && t.title) return t.title;
        return null;
      })
      .filter((t: any) => t !== null);
  } else if (typeof article.subtopics === 'string' && article.subtopics) {
    tags = [article.subtopics];
  }

  return {
    id: ownId,
    newsId: article.news_id || ownId, // Add news_id for cross-language navigation
    title: article.title || 'Untitled',
    summary: article.summary || '',
    body: article.body || article.summary || '',
    thumbnail,
    publishedAt: publishedAt,
    timeAgo: `${clampedMinutes} min ago`,
    outletId: outlet?.id || 'unknown',
    outletName: outlet?.name || 'Unknown',
    tags,
    subtopics: article.subtopics || [],
    viewCount: 0,
    category: outlet?.category || 'topics'
  };
}
/**
 * Close the MongoDB connection and clear the cached handles.
 * Does nothing when no client is open.
 */
export async function closeNewsAPIConnection(): Promise<void> {
  if (!client) {
    return;
  }
  await client.close();
  client = null;
  db = null;
  console.log('Closed news-api MongoDB connection');
}

75
server/openai-utils.ts Normal file
View File

@ -0,0 +1,75 @@
import OpenAI from "openai";
// the newest OpenAI model is "gpt-5" which was released August 7, 2025. do not change this unless explicitly requested by the user
// Shared OpenAI client for this module; the API key is read from the
// OPENAI_API_KEY environment variable.
const openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY });
/** Outlet data that is interpolated into the profile-generation prompt. */
export interface MediaOutletProfile {
  /** Outlet name; also used as the key of the batch result. */
  name: string;
  /** Category label shown in the prompt. */
  category: string;
  /** Focus subject shown in the prompt. */
  focusSubject: string;
  /** Current short bio. */
  bio: string;
  /** Additional bio fragments; joined with spaces for the prompt. */
  fullBio: Array<string>;
}
/**
 * Ask the OpenAI chat completions API for a long, Wikipedia-style HTML
 * profile of the given outlet.
 *
 * @returns the generated HTML, or an empty string when the first choice has
 *   no content.
 * @throws Error prefixed with "Failed to generate profile:" on any failure.
 */
export async function generateDetailedProfile(outlet: MediaOutletProfile): Promise<string> {
  try {
    const { name, category, focusSubject, bio, fullBio } = outlet;
    const instructions = `Create a comprehensive, Wikipedia-style profile for the following media outlet/subject. The profile should be detailed, extensive, and well-structured with multiple sections. Write in a neutral, encyclopedic tone similar to Wikipedia articles.
Media Outlet Information:
- Name: ${name}
- Category: ${category}
- Focus Subject: ${focusSubject}
- Current Bio: ${bio}
- Additional Info: ${fullBio.join(' ')}
Please create a detailed profile that includes:
1. **Overview/Introduction** - Comprehensive introduction paragraph
2. **Background & History** - Detailed history and formation
3. **Key Achievements** - Major accomplishments and milestones
4. **Technology & Innovation** - Technical aspects, innovations, or methodologies (if applicable)
5. **Market Position & Influence** - Position in industry/market and influence
6. **Notable Developments** - Significant events, partnerships, or developments
7. **Future Outlook** - Current projects and future direction
8. **Industry Impact** - Broader impact on the industry or field
For people: Include personal background, career history, education, major contributions, and influence.
For companies: Include founding story, business model, products/services, market position, and key partnerships.
For topics/technologies: Include technical background, development history, applications, and significance.
Make it comprehensive and informative, similar to a detailed Wikipedia article. Use HTML formatting with proper headings (h2, h3), paragraphs, and lists where appropriate. Aim for 2000-3000 words.
Respond with only the HTML content, no markdown or additional formatting.`;

    const completion = await openai.chat.completions.create({
      model: "gpt-5",
      messages: [{ role: "user", content: instructions }],
      max_completion_tokens: 4000
    });

    const html = completion.choices[0].message.content;
    return html || "";
  } catch (error) {
    console.error("Error generating detailed profile:", error);
    const reason = error instanceof Error ? error.message : "Unknown error";
    throw new Error(`Failed to generate profile: ${reason}`);
  }
}
/**
 * Generate profiles for several outlets, one request at a time.
 *
 * A one-second pause is inserted between consecutive requests to avoid rate
 * limiting. It also applies after a failed request and is not spent after
 * the last outlet. A failed outlet gets a small HTML error notice instead of
 * a profile, so one failure never aborts the batch.
 *
 * @param outlets outlets to generate profiles for.
 * @returns generated HTML keyed by outlet name.
 */
export async function generateBatchProfiles(outlets: MediaOutletProfile[]): Promise<Record<string, string>> {
  const profiles: Record<string, string> = {};

  // The fallback notice is HTML, so the outlet name must be escaped.
  const escapeHtml = (text: string): string =>
    String(text)
      .replace(/&/g, '&amp;')
      .replace(/</g, '&lt;')
      .replace(/>/g, '&gt;')
      .replace(/"/g, '&quot;')
      .replace(/'/g, '&#39;');

  for (let index = 0; index < outlets.length; index++) {
    const outlet = outlets[index];
    if (index > 0) {
      // Add a small delay to avoid rate limiting
      await new Promise(resolve => setTimeout(resolve, 1000));
    }
    try {
      console.log(`Generating profile for ${outlet.name}...`);
      profiles[outlet.name] = await generateDetailedProfile(outlet);
    } catch (error) {
      console.error(`Failed to generate profile for ${outlet.name}:`, error);
      profiles[outlet.name] = `<h2>Profile Generation Error</h2><p>Unable to generate detailed profile for ${escapeHtml(outlet.name)}. Please try again later.</p>`;
    }
  }
  return profiles;
}

315
server/outletParser.ts Normal file
View File

@ -0,0 +1,315 @@
import { readFileSync } from 'fs';
import { join } from 'path';
/** One outlet parsed from the outlet file, with the URLs listed under it. */
export interface OutletLinks {
  /** Cleaned display name. */
  name: string;
  /** Section of the file the outlet was found in. */
  category: 'people' | 'topics' | 'companies';
  /** Lower-case, dash-separated form of the name. */
  focusSubject: string;
  /** URLs listed under the outlet, in file order. */
  urls: Array<string>;
}
/** Result of parsing an outlet file: outlets grouped by category. */
export interface ParsedOutlets {
  people: Array<OutletLinks>;
  topics: Array<OutletLinks>;
  companies: Array<OutletLinks>;
  /** Number of outlets stored across all three categories. */
  total: number;
}
/**
 * Parser for the outlet link file: a markdown-like text file with
 * "## People" / "## Topics" / "## Companies" sections, "### N. Name" outlet
 * headers (or "N. Name" entries in the companies section) and URL lines.
 */
export class OutletParser {
  /**
   * Parse the attached file and return structured outlet data.
   *
   * Only outlets with at least one URL are stored. URLs that appear while no
   * outlet is open are ignored.
   *
   * @param filePath path of the outlet file to read.
   * @throws Error prefixed with "Failed to parse outlet file:" on any failure.
   */
  static parseOutletFile(filePath: string): ParsedOutlets {
    try {
      console.log(`Parsing outlet file: ${filePath}`);
      const content = readFileSync(filePath, 'utf-8');
      const lines = content.split('\n').map(line => line.trim()).filter(line => line);

      const parsed: ParsedOutlets = {
        people: [],
        topics: [],
        companies: [],
        total: 0,
      };
      let currentCategory: 'people' | 'topics' | 'companies' | null = null;
      let currentOutlet: OutletLinks | null = null;

      for (const line of lines) {
        // Detect section headers FIRST (before skipping other # lines)
        if (line.includes('## People')) {
          currentCategory = 'people';
          continue;
        }
        if (line.includes('## Topics')) {
          currentCategory = 'topics';
          continue;
        }
        if (line.includes('## Companies') || line.startsWith('📋 Companies')) {
          currentCategory = 'companies';
          continue;
        }

        // Skip other markdown headers (after section detection)
        if (line.startsWith('#') && !line.startsWith('###')) continue;

        // Parse outlet headers like "### 1. Ala Shaabana - Bittensor 공동창립자"
        if (line.startsWith('###') && currentCategory) {
          // Close the previous outlet. It is reset even when this header
          // cannot be parsed, so it is never stored twice and does not
          // collect URLs that belong to another heading.
          this.storeOutlet(parsed, currentOutlet);
          currentOutlet = null;

          // Extract outlet name (remove ### and number)
          const nameMatch = line.match(/###\s*\d+\.\s*(.+?)(?:\s*\([^)]*\))?$/);
          if (nameMatch) {
            currentOutlet = this.createOutlet(nameMatch[1], currentCategory);
          }
          continue;
        }

        // Parse numbered URLs like "1. https://example.com". A numbered URL
        // is always consumed here, so it can never be mistaken for a company
        // entry when no outlet is open.
        const urlMatch = line.match(/^\d+\.\s*(https?:\/\/.+)$/);
        if (urlMatch) {
          if (currentOutlet) {
            currentOutlet.urls.push(urlMatch[1]);
          }
          continue;
        }

        // Parse direct URLs for companies section
        if (line.startsWith('http://') || line.startsWith('https://')) {
          if (currentOutlet) {
            currentOutlet.urls.push(line);
          }
          continue;
        }

        // Parse company entries like "1. Ava Labs (Avalanche 플랫폼)"
        if (currentCategory === 'companies' && /^\d+\.\s*[A-Za-z]/.test(line)) {
          this.storeOutlet(parsed, currentOutlet);
          currentOutlet = null;

          const companyMatch = line.match(/^\d+\.\s*(.+?)(?:\s*\([^)]*\))?$/);
          if (companyMatch) {
            currentOutlet = this.createOutlet(companyMatch[1], 'companies');
          }
        }
      }

      // Don't forget the last outlet
      this.storeOutlet(parsed, currentOutlet);

      console.log(`Successfully parsed ${parsed.total} outlets:`);
      console.log(`- People: ${parsed.people.length}`);
      console.log(`- Topics: ${parsed.topics.length}`);
      console.log(`- Companies: ${parsed.companies.length}`);
      return parsed;
    } catch (error: any) {
      console.error('Error parsing outlet file:', error.message);
      throw new Error(`Failed to parse outlet file: ${error.message}`);
    }
  }

  // Add a finished outlet to its category; outlets without URLs are dropped.
  private static storeOutlet(parsed: ParsedOutlets, outlet: OutletLinks | null): void {
    if (!outlet || outlet.urls.length === 0) return;
    parsed[outlet.category].push(outlet);
    parsed.total++;
  }

  // Build an empty outlet record from the raw name found in the file.
  private static createOutlet(rawName: string, category: OutletLinks['category']): OutletLinks {
    const cleanedName = this.cleanOutletName(rawName.trim());
    return {
      name: cleanedName,
      category,
      focusSubject: this.generateFocusSubject(cleanedName),
      urls: [],
    };
  }

  // Clean outlet names by removing parenthetical descriptions and expanding
  // known abbreviations
  private static cleanOutletName(rawName: string): string {
    // Remove parenthetical descriptions like "(연방준비제도 의장)" or "(OSS Capital 창립자)"
    const cleaned = rawName.replace(/\s*\([^)]*\)/g, '').trim();

    // Handle special cases. A Map keeps names such as "constructor" from
    // resolving to Object.prototype members.
    const specialCases = new Map<string, string>([
      ['CBDC', 'Central Bank Digital Currency'],
      ['CFTC', 'Commodity Futures Trading Commission'],
      ['SEC', 'Securities and Exchange Commission'],
      ['DAT', 'Digital Asset Treasury'],
      ['DeFi', 'Decentralized Finance'],
      ['DEX', 'Decentralized Exchange'],
      ['NFT', 'Non-Fungible Token'],
      ['RWA', 'Real World Assets'],
      ['SWF', 'Sovereign Wealth Fund'],
    ]);
    return specialCases.get(cleaned) || cleaned;
  }

  // Generate focus subject for database compatibility
  private static generateFocusSubject(rawName: string): string {
    const subject = rawName.replace(/\s*\([^)]*\)/g, '').trim();
    // Convert to lowercase and replace spaces with dashes for ID compatibility
    return subject.toLowerCase()
      .replace(/[^a-z0-9\s-]/g, '') // Remove special characters
      .replace(/\s+/g, '-') // Replace spaces with dashes
      .replace(/--+/g, '-') // Replace multiple dashes with single dash
      .replace(/^-+|-+$/g, ''); // Remove leading/trailing dashes
  }

  // Categorize outlet based on name
  private static categorizeOutlet(name: string): 'people' | 'topics' | 'companies' {
    const cleanName = name.toLowerCase().trim();
    // An empty name is a substring of every entry below; treat it as a topic
    // instead of letting it match the people list.
    if (!cleanName) {
      return 'topics';
    }

    // People (individual names)
    const people = [
      'ala shaabana', 'alex karp', 'arthur hayes', 'donald trump jr', 'eric trump',
      'jacob robert steeves', 'jared kushner', 'j.d. vance', 'jensen huang',
      'jerome powell', 'joseph jacks', 'robert myers', 'yat siu'
    ];
    // Companies
    const companies = [
      'xtao', 'yuma', 'taox', 'oblong', 'ava labs', 'boston dynamics',
      'blackrock', 'chainlink', 'circle', 'cme group', 'manifold labs'
    ];

    // Substring match in either direction, people first
    if (people.some(person => cleanName.includes(person) || person.includes(cleanName))) {
      return 'people';
    }
    if (companies.some(company => cleanName.includes(company) || company.includes(cleanName))) {
      return 'companies';
    }
    // Everything else goes to topics
    return 'topics';
  }

  // Get specific outlet data by name (case-insensitive) or by focus subject
  static getOutletByName(parsed: ParsedOutlets, name: string): OutletLinks | null {
    const allOutlets = [...parsed.people, ...parsed.topics, ...parsed.companies];
    return allOutlets.find(outlet =>
      outlet.name.toLowerCase() === name.toLowerCase() ||
      outlet.focusSubject === name
    ) || null;
  }

  // Get all URLs from parsed data
  static getAllUrls(parsed: ParsedOutlets): string[] {
    const allOutlets = [...parsed.people, ...parsed.topics, ...parsed.companies];
    return allOutlets.flatMap(outlet => outlet.urls);
  }

  // Get URLs by category
  static getUrlsByCategory(parsed: ParsedOutlets, category: 'people' | 'topics' | 'companies'): string[] {
    return parsed[category].flatMap(outlet => outlet.urls);
  }

  // Convert parsed data to our existing outlet format
  static convertToOutletFormat(parsed: ParsedOutlets): Array<{
    id: string;
    name: string;
    description: string;
    category: string;
    focusSubject: string;
    avatar?: string;
    profileImage?: string;
    bio: string;
    fullBio?: string[];
    urls: string[];
  }> {
    const allOutlets = [...parsed.people, ...parsed.topics, ...parsed.companies];
    return allOutlets.map(outlet => ({
      id: outlet.focusSubject,
      name: outlet.name,
      description: this.generateDescription(outlet),
      category: outlet.category,
      focusSubject: outlet.focusSubject,
      avatar: this.getDefaultAvatar(outlet.category),
      profileImage: this.getDefaultProfileImage(outlet.category),
      bio: this.generateBio(outlet),
      fullBio: this.generateFullBio(outlet),
      urls: outlet.urls,
    }));
  }

  // One-line description, chosen by category
  private static generateDescription(outlet: OutletLinks): string {
    const descriptions = {
      people: `Latest news and analysis about ${outlet.name}`,
      topics: `Comprehensive coverage of ${outlet.name} developments and trends`,
      companies: `${outlet.name} news, updates, and market analysis`,
    };
    return descriptions[outlet.category];
  }

  // Placeholder avatar path for a category; unknown categories use the topic image
  private static getDefaultAvatar(category: string): string {
    const avatars = {
      people: '/api/assets/default-person.jpg',
      topics: '/api/assets/default-topic.jpg',
      companies: '/api/assets/default-company.jpg',
    };
    return avatars[category as keyof typeof avatars] || avatars.topics;
  }

  // The profile image currently reuses the default avatar
  private static getDefaultProfileImage(category: string): string {
    return this.getDefaultAvatar(category);
  }

  // Short generic bio, chosen by category
  private static generateBio(outlet: OutletLinks): string {
    const bios = {
      people: `${outlet.name} is a prominent figure in technology and business, making headlines with strategic decisions and market insights.`,
      topics: `Stay informed about the latest developments in ${outlet.name} with comprehensive coverage and expert analysis.`,
      companies: `${outlet.name} continues to shape the industry with innovative solutions and strategic partnerships.`,
    };
    return bios[outlet.category];
  }

  // Three-sentence generic bio, chosen by category
  private static generateFullBio(outlet: OutletLinks): string[] {
    const fullBios = {
      people: [
        `${outlet.name} is a key figure in the technology and business landscape.`,
        `Known for strategic leadership and innovative thinking in their field.`,
        `Continues to influence industry trends and developments globally.`
      ],
      topics: [
        `${outlet.name} represents a critical area of technological advancement.`,
        `Dynamic sector with ongoing market trends, regulatory updates, and innovations.`,
        `Comprehensive resource requiring expert analysis from leading industry professionals.`
      ],
      companies: [
        `${outlet.name} is a significant player in the technology industry.`,
        `Known for innovative products and strategic market positioning.`,
        `Continues to drive industry growth and technological advancement.`
      ]
    };
    return fullBios[outlet.category];
  }
}
/**
 * Parse the outlet file shipped under `attached_assets` in the current
 * working directory.
 */
export function parseAttachedOutletFile(): ParsedOutlets {
  const attachmentName = 'Pasted-Ala-Shaabana-https-www-rootdata-com-news-323625-https-ffnews-com-newsarticle-funding-xtao-tsx-v-1758557992922_1758557992922.txt';
  const attachmentPath = join(process.cwd(), 'attached_assets', attachmentName);
  return OutletParser.parseOutletFile(attachmentPath);
}
export default OutletParser;

1772
server/routes.ts Normal file

File diff suppressed because it is too large Load Diff

445
server/scraper.ts Normal file
View File

@ -0,0 +1,445 @@
import axios from 'axios';
import * as cheerio from 'cheerio';
import { createWriteStream, existsSync, mkdirSync } from 'fs';
import { join } from 'path';
import sharp from 'sharp';
/** Article data extracted from a scraped page. */
interface ScrapedArticle {
  /** URL the article was scraped from. */
  url: string;
  title: string;
  summary: string;
  body: string;
  /** Main image of the article, when one was found. */
  imageUrl?: string;
  publishedAt: Date;
  author?: string;
  tags: Array<string>;
}
/** Descriptor of a scraped image. */
interface ScrapedImage {
  url: string;
  filename: string;
  alt?: string;
  // Dimensions are optional.
  width?: number;
  height?: number;
}
export class WebScraper {
// User-Agent header sent with every scrape request (a desktop Chrome string).
private static readonly USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36';
// Per-request timeout in milliseconds, passed to axios.
private static readonly TIMEOUT = 30000; // 30 seconds
// Number of retries after the first failed attempt in scrapeArticle.
private static readonly MAX_RETRIES = 3;
constructor() {
  // Make sure attached_assets/scraped exists under the working directory.
  const scrapedDir = join(process.cwd(), 'attached_assets', 'scraped');
  if (existsSync(scrapedDir)) {
    return;
  }
  mkdirSync(scrapedDir, { recursive: true });
}
/**
 * Main scraping method with retry logic.
 *
 * Downloads the page and extracts the article from it. A failed attempt is
 * retried after two seconds, up to MAX_RETRIES retries. Responses with a 4xx
 * status other than 408 and 429 are treated as permanent and are not
 * retried, since asking again cannot change the outcome.
 *
 * @param url        page to scrape.
 * @param retryCount number of attempts already made (callers normally omit it).
 * @returns the scraped article, or null when the page could not be fetched
 *   or lacks the essential article data.
 */
async scrapeArticle(url: string, retryCount = 0): Promise<ScrapedArticle | null> {
  for (let attempt = retryCount; ; attempt++) {
    try {
      console.log(`Scraping ${url} (attempt ${attempt + 1})`);
      const response = await axios.get(url, {
        headers: {
          'User-Agent': WebScraper.USER_AGENT,
          'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
          'Accept-Language': 'en-US,en;q=0.5',
          'Accept-Encoding': 'gzip, deflate, br',
          'Connection': 'keep-alive',
          'Upgrade-Insecure-Requests': '1',
        },
        timeout: WebScraper.TIMEOUT,
        maxRedirects: 5,
      });
      const $ = cheerio.load(response.data);

      // Extract article data using multiple selectors for different news sites
      const article = await this.extractArticleData($, url);
      if (!article) {
        console.warn(`Failed to extract article data from ${url}`);
        return null;
      }
      console.log(`Successfully scraped: ${article.title}`);
      return article;
    } catch (error: unknown) {
      const message = error instanceof Error ? error.message : String(error);
      console.error(`Error scraping ${url}:`, message);

      // HTTP errors raised by axios carry the response; other errors do not.
      const status = (error as { response?: { status?: unknown } } | null)?.response?.status;
      const isPermanent =
        typeof status === 'number' &&
        status >= 400 && status < 500 &&
        status !== 408 && status !== 429;
      if (isPermanent || attempt >= WebScraper.MAX_RETRIES) {
        return null;
      }

      console.log(`Retrying ${url} in 2 seconds...`);
      await new Promise(resolve => setTimeout(resolve, 2000));
    }
  }
}
/**
 * Collect every article field from the parsed page.
 *
 * @returns the assembled article, or null when the title or the body is
 *   missing. A missing summary is generated from the body; a missing
 *   publication date defaults to the current time.
 */
private async extractArticleData($: cheerio.CheerioAPI, url: string): Promise<ScrapedArticle | null> {
  const title = this.extractTitle($);
  const summary = this.extractSummary($);
  const body = this.extractBody($);
  const imageUrl = this.extractMainImage($, url);
  const publishedAt = this.extractPublishedDate($);
  const author = this.extractAuthor($);
  const tags = this.extractTags($);

  const hasEssentials = Boolean(title) && Boolean(body);
  if (!hasEssentials) {
    console.warn(`Missing essential data for ${url}. Title: ${!!title}, Body: ${!!body}`);
    return null;
  }

  const resolvedSummary = summary
    ? summary
    : await this.generateSummaryFromBody(body, title);

  return {
    url,
    title,
    summary: resolvedSummary,
    body,
    imageUrl,
    publishedAt: publishedAt || new Date(),
    author,
    tags,
  };
}
private extractTitle($: cheerio.CheerioAPI): string {
// Try multiple common title selectors
const selectors = [
'h1.article-title',
'h1.entry-title',
'h1[class*="headline"]',
'h1[class*="title"]',
'.article-header h1',
'.post-title',
'h1',
'title',
'[property="og:title"]',
];
for (const selector of selectors) {
const element = $(selector);
if (element.length && element.text().trim()) {
return element.first().text().trim();
}
}
return '';
}
private extractSummary($: cheerio.CheerioAPI): string {
const selectors = [
'.article-summary',
'.entry-summary',
'.article-excerpt',
'.post-excerpt',
'[class*="summary"]',
'[class*="excerpt"]',
'[property="og:description"]',
'[name="description"]',
];
for (const selector of selectors) {
const element = $(selector);
if (element.length) {
const text = selector.includes('property') || selector.includes('name')
? element.attr('content')
: element.text();
if (text && text.trim()) {
return text.trim();
}
}
}
return '';
}
private extractBody($: cheerio.CheerioAPI): string {
const selectors = [
'.article-content',
'.entry-content',
'.post-content',
'.article-body',
'[class*="content"]',
'.story-body',
'.article p',
'.post p',
];
for (const selector of selectors) {
const elements = $(selector);
if (elements.length) {
// Get text from all paragraphs and join them
let bodyText = '';
elements.each((_, el) => {
const text = $(el).text().trim();
if (text && text.length > 50) { // Skip short elements (ads, captions, etc.)
bodyText += text + '\n\n';
}
});
if (bodyText.length > 200) { // Ensure we have substantial content
return bodyText.trim();
}
}
}
return '';
}
private extractMainImage($: cheerio.CheerioAPI, baseUrl: string): string | undefined {
const selectors = [
'.article-image img',
'.featured-image img',
'[class*="hero"] img',
'.post-thumbnail img',
'[property="og:image"]',
'meta[property="og:image"]',
'.article img:first',
'img[class*="featured"]',
];
for (const selector of selectors) {
const element = $(selector);
if (element.length) {
let src = selector.includes('property') || selector.includes('meta')
? element.attr('content')
: element.attr('src');
if (src) {
// Convert relative URLs to absolute
if (src.startsWith('//')) {
src = 'https:' + src;
} else if (src.startsWith('/')) {
const urlObj = new URL(baseUrl);
src = `${urlObj.origin}${src}`;
}
if (src.startsWith('http')) {
return src;
}
}
}
}
return undefined;
}
private extractPublishedDate($: cheerio.CheerioAPI): Date | null {
const selectors = [
'[property="article:published_time"]',
'[name="publish-date"]',
'.publish-date',
'.article-date',
'.entry-date',
'time[datetime]',
'[class*="date"]',
];
for (const selector of selectors) {
const element = $(selector);
if (element.length) {
let dateStr = element.attr('content') || element.attr('datetime') || element.text();
if (dateStr) {
const date = new Date(dateStr);
if (!isNaN(date.getTime())) {
return date;
}
}
}
}
return null;
}
private extractAuthor($: cheerio.CheerioAPI): string | undefined {
const selectors = [
'[rel="author"]',
'.author-name',
'.byline',
'[class*="author"]',
'[property="article:author"]',
];
for (const selector of selectors) {
const element = $(selector);
if (element.length) {
const author = selector.includes('property')
? element.attr('content')
: element.text();
if (author && author.trim()) {
return author.trim();
}
}
}
return undefined;
}
private extractTags($: cheerio.CheerioAPI): string[] {
const tags = new Set<string>();
// Extract from various tag selectors
const selectors = [
'.tags a',
'.tag-list a',
'[class*="tag"] a',
'.categories a',
'[rel="tag"]',
];
for (const selector of selectors) {
$(selector).each((_, el) => {
const tag = $(el).text().trim();
if (tag) {
tags.add(tag);
}
});
}
return Array.from(tags);
}
private async generateSummaryFromBody(body: string, title?: string): Promise<string> {
try {
// Try AI-powered summary generation first
const response = await axios.post('http://localhost:5000/api/generate-summary', {
content: body,
title: title || ''
}, {
timeout: 10000, // 10 second timeout
headers: {
'Content-Type': 'application/json'
}
});
if (response.data?.summary) {
console.log('✅ AI-generated summary created');
return response.data.summary;
}
} catch (error) {
console.warn('⚠️ AI summary generation failed, using fallback method:', error instanceof Error ? error.message : 'Unknown error');
}
// Fallback to basic summary generation (improved version)
const cleanedBody = body.replace(/^This article was originally published at .+?\.\n\n/i, '').trim();
const sentences = cleanedBody.split(/[.!?]+/);
let summary = '';
for (const sentence of sentences) {
const trimmed = sentence.trim();
if (trimmed.length < 10) continue; // Skip very short sentences
if (summary.length + trimmed.length > 150) break;
summary += (summary ? '. ' : '') + trimmed;
}
return (summary + (summary ? '.' : 'Content not available.')).substring(0, 150);
}
// Download and process images
async downloadImage(imageUrl: string, filename: string): Promise<ScrapedImage | null> {
try {
console.log(`Downloading image: ${imageUrl}`);
const response = await axios.get(imageUrl, {
responseType: 'stream',
headers: {
'User-Agent': WebScraper.USER_AGENT,
},
timeout: WebScraper.TIMEOUT,
});
const assetsDir = join(process.cwd(), 'attached_assets', 'scraped');
const imagePath = join(assetsDir, filename);
const writer = createWriteStream(imagePath);
response.data.pipe(writer);
await new Promise((resolve, reject) => {
writer.on('finish', resolve);
writer.on('error', reject);
});
// Get image metadata
const metadata = await sharp(imagePath).metadata();
console.log(`Successfully downloaded: ${filename} (${metadata.width}x${metadata.height})`);
return {
url: imageUrl,
filename,
width: metadata.width,
height: metadata.height,
};
} catch (error: any) {
console.error(`Error downloading image ${imageUrl}:`, error.message);
return null;
}
}
// Create thumbnail from downloaded image
async createThumbnail(imagePath: string, thumbnailPath: string, size = 300): Promise<boolean> {
try {
await sharp(imagePath)
.resize(size, size, {
fit: 'cover',
position: 'center',
})
.jpeg({ quality: 80 })
.toFile(thumbnailPath);
return true;
} catch (error: any) {
console.error(`Error creating thumbnail:`, error.message);
return false;
}
}
// Batch scraping with concurrency control - returns both successes and failures
async scrapeMultipleArticles(urls: string[], maxConcurrency = 5): Promise<{
successes: ScrapedArticle[];
failures: Array<{url: string; error: string}>;
}> {
const successes: ScrapedArticle[] = [];
const failures: Array<{url: string; error: string}> = [];
const chunks = this.chunkArray(urls, maxConcurrency);
for (const chunk of chunks) {
const promises = chunk.map(url =>
this.scrapeArticle(url)
.then(result => ({ url, result, error: null }))
.catch(error => ({ url, result: null, error: error.message || 'Unknown error' }))
);
const chunkResults = await Promise.all(promises);
// Separate successes and failures
for (const { url, result, error } of chunkResults) {
if (result) {
successes.push(result);
} else {
failures.push({ url, error: error || 'Failed to scrape' });
}
}
// Small delay between batches to be respectful to servers
await new Promise(resolve => setTimeout(resolve, 1000));
}
return { successes, failures };
}
private chunkArray<T>(array: T[], chunkSize: number): T[][] {
const chunks: T[][] = [];
for (let i = 0; i < array.length; i += chunkSize) {
chunks.push(array.slice(i, i + chunkSize));
}
return chunks;
}
}

1969
server/storage.ts Normal file

File diff suppressed because it is too large Load Diff

85
server/vite.ts Normal file
View File

@ -0,0 +1,85 @@
import express, { type Express } from "express";
import fs from "fs";
import path from "path";
import { createServer as createViteServer, createLogger } from "vite";
import { type Server } from "http";
import viteConfig from "../vite.config";
import { nanoid } from "nanoid";
// Base Vite logger; setupVite spreads it into a custom logger whose `error`
// handler logs through this instance and then exits the process.
const viteLogger = createLogger();
/**
 * Prints a message to stdout prefixed with the current wall-clock time
 * (12-hour, en-US) and a bracketed source label.
 *
 * @param message Text to print.
 * @param source Label identifying the emitter; defaults to "express".
 */
export function log(message: string, source = "express") {
  const clockFormat: Intl.DateTimeFormatOptions = {
    hour: "numeric",
    minute: "2-digit",
    second: "2-digit",
    hour12: true,
  };
  const timestamp = new Date().toLocaleTimeString("en-US", clockFormat);
  const label = `[${source}]`;
  console.log([timestamp, label, message].join(" "));
}
/**
 * Attaches a Vite dev server (middleware mode) to the Express app and serves
 * the client's index.html, transformed by Vite, for every remaining route.
 *
 * @param app Express application to mount the middlewares on.
 * @param server HTTP server handed to Vite for its HMR connection.
 */
export async function setupVite(app: Express, server: Server) {
  const devServerOptions = {
    middlewareMode: true,
    hmr: { server },
    allowedHosts: true as const,
  };

  const vite = await createViteServer({
    ...viteConfig,
    configFile: false,
    customLogger: {
      ...viteLogger,
      // Report the error through the base logger, then terminate the process.
      error: (msg, options) => {
        viteLogger.error(msg, options);
        process.exit(1);
      },
    },
    server: devServerOptions,
    appType: "custom",
  });

  app.use(vite.middlewares);

  const indexHtmlPath = path.resolve(
    import.meta.dirname,
    "..",
    "client",
    "index.html",
  );

  app.use("*", async (req, res, next) => {
    const requestedUrl = req.originalUrl;
    try {
      // Read index.html from disk on every request so edits are picked up.
      const rawHtml = await fs.promises.readFile(indexHtmlPath, "utf-8");
      // Append a random query string to the entry script's URL.
      const cacheBustedHtml = rawHtml.replace(
        `src="/src/main.tsx"`,
        `src="/src/main.tsx?v=${nanoid()}"`,
      );
      const html = await vite.transformIndexHtml(requestedUrl, cacheBustedHtml);
      res.status(200).set({ "Content-Type": "text/html" }).end(html);
    } catch (err) {
      vite.ssrFixStacktrace(err as Error);
      next(err);
    }
  });
}
/**
 * Serves the pre-built client from the `public` directory next to this module
 * and answers every other route with its index.html.
 *
 * @param app Express application to mount the static handlers on.
 * @throws Error when the build directory does not exist.
 */
export function serveStatic(app: Express) {
  const buildDir = path.resolve(import.meta.dirname, "public");
  const buildExists = fs.existsSync(buildDir);

  if (!buildExists) {
    throw new Error(
      `Could not find the build directory: ${buildDir}, make sure to build the client first`,
    );
  }

  app.use(express.static(buildDir));

  // fall through to index.html if the file doesn't exist
  app.use("*", (_req, res) => {
    const indexHtml = path.resolve(buildDir, "index.html");
    res.sendFile(indexHtml);
  });
}