feat: SAPIENS Mobile App - Initial commit
React Native mobile application for the SAPIENS news platform.
Consolidated all previous history into a single commit.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
92
server/index.ts
Normal file
92
server/index.ts
Normal file
@ -0,0 +1,92 @@
|
||||
import express, { type Request, Response, NextFunction } from "express";
|
||||
import { registerRoutes } from "./routes";
|
||||
import { setupVite, serveStatic, log } from "./vite";
|
||||
|
||||
const app = express();
|
||||
|
||||
/**
 * CORS middleware for the Expo app.
 *
 * Sets permissive CORS headers (any origin) on every response and answers
 * preflight `OPTIONS` requests immediately with a 200 instead of passing
 * them on to the rest of the middleware chain.
 */
app.use((req, res, next) => {
  const corsHeaders: Array<[string, string]> = [
    ['Access-Control-Allow-Origin', '*'],
    ['Access-Control-Allow-Methods', 'GET, POST, PUT, DELETE, OPTIONS, PATCH'],
    ['Access-Control-Allow-Headers', 'Content-Type, Authorization'],
  ];
  for (const [field, value] of corsHeaders) {
    res.header(field, value);
  }

  // Anything that is not a preflight request continues down the chain.
  if (req.method !== 'OPTIONS') {
    next();
    return;
  }

  // Preflight: the headers above are the whole answer.
  res.sendStatus(200);
});
|
||||
|
||||
app.use(express.json());
|
||||
app.use(express.urlencoded({ extended: false }));
|
||||
|
||||
/**
 * Request logger for `/api` routes.
 *
 * Wraps `res.json` to remember the JSON body that was sent, then, once the
 * response has finished, logs method, path, status, duration and the body.
 * The log line is capped at 80 characters (79 plus an ellipsis).
 */
app.use((req, res, next) => {
  const startedAt = Date.now();
  const requestPath = req.path;
  let sentJson: Record<string, any> | undefined = undefined;

  // Intercept res.json so the payload can be logged; the real method still runs.
  const sendJson = res.json;
  res.json = function (payload, ...rest) {
    sentJson = payload;
    return sendJson.apply(res, [payload, ...rest]);
  };

  res.on("finish", () => {
    const elapsedMs = Date.now() - startedAt;
    if (!requestPath.startsWith("/api")) {
      return;
    }

    const summary = `${req.method} ${requestPath} ${res.statusCode} in ${elapsedMs}ms`;
    const full = sentJson ? `${summary} :: ${JSON.stringify(sentJson)}` : summary;

    // Keep log lines short: at most 79 characters followed by an ellipsis.
    log(full.length > 80 ? full.slice(0, 79) + "…" : full);
  });

  next();
});
|
||||
|
||||
/**
 * Server bootstrap: registers the API routes, installs the error handler,
 * wires up either the Vite dev middleware or production static serving, and
 * finally starts listening on the configured port.
 */
(async () => {
  const server = await registerRoutes(app);

  // Central error handler. It must not re-throw after responding: a throw
  // here is forwarded to Express's default handler, which, with headers
  // already sent, closes the connection. Log the error instead.
  app.use((err: any, _req: Request, res: Response, next: NextFunction) => {
    const status = err.status || err.statusCode || 500;
    const message = err.message || "Internal Server Error";

    console.error(err);

    // If a response is already in flight we cannot send a JSON body any
    // more; delegate to the default handler so the connection is cleaned up.
    if (res.headersSent) {
      return next(err);
    }

    res.status(status).json({ message });
  });

  // importantly only setup vite in development and after
  // setting up all the other routes so the catch-all route
  // doesn't interfere with the other routes
  const isProduction = app.get("env") === "production" || process.env.REPLIT_DEPLOYMENT === "1";

  if (!isProduction) {
    await setupVite(app, server);
  } else {
    log("Setting up static file serving for production");

    // Protect unmatched API routes from static file serving catch-all.
    // This should come after all API routes are registered.
    app.use('/api/*', (req, res) => {
      // Inside a mounted handler `req.path` is relative to the mount point,
      // so report the URL the client actually requested.
      res.status(404).json({ error: `API endpoint not found: ${req.originalUrl}` });
    });

    serveStatic(app);
  }

  // ALWAYS serve the app on the port specified in the environment variable PORT
  // Other ports are firewalled. Default to 5000 if not specified.
  // this serves both the API and the client.
  // It is the only port that is not firewalled.
  const port = parseInt(process.env.PORT || '5000', 10);
  server.listen(port, "0.0.0.0", () => {
    log(`serving on port ${port}`);
  });
})().catch((err) => {
  // Startup failures (route registration, Vite setup) would otherwise surface
  // only as an unhandled rejection; report them and exit non-zero.
  console.error("Failed to start server:", err);
  process.exit(1);
});
|
||||
532
server/newsapi-client.ts
Normal file
532
server/newsapi-client.ts
Normal file
@ -0,0 +1,532 @@
|
||||
/**
|
||||
* News-API Client
|
||||
*
|
||||
* Handles communication with news-api MongoDB and data transformation
|
||||
* to sapiens-mobile schema format.
|
||||
*/
|
||||
|
||||
import { MongoClient, Db, ObjectId } from 'mongodb';
|
||||
import fs from 'fs';
|
||||
import path from 'path';
|
||||
|
||||
/** MongoDB connection string; falls back to a local instance when MONGODB_URL is unset or empty. */
const MONGODB_URL = process.env.MONGODB_URL || 'mongodb://localhost:27017';

/** Name of the database opened on the news-api MongoDB server. */
const DB_NAME = 'ai_writer_db';

/** Shape of the outlets data loaded from outlets-extracted.json. */
type OutletsByCategory = {
  people: any[];
  topics: any[];
  companies: any[];
};

/** Cache for outlets data (loaded from outlets-extracted.json); null until first load. */
let outlets: OutletsByCategory | null = null;

/** MongoDB client; null while disconnected. */
let client: MongoClient | null = null;

/** Database handle obtained from `client`; null while disconnected. */
let db: Db | null = null;
|
||||
|
||||
/** In-flight connection attempt, shared by concurrent callers of connectToNewsAPI. */
let pendingNewsAPIConnection: Promise<void> | null = null;

/**
 * Initialize the MongoDB connection (idempotent).
 *
 * Resolves immediately when a connection is already established. Concurrent
 * callers share a single connection attempt. The module-level `client` and
 * `db` are only published after `connect()` has succeeded, so a failed
 * attempt leaves the module disconnected and the next call retries.
 *
 * @throws whatever the MongoDB driver throws when the connection fails.
 */
export async function connectToNewsAPI(): Promise<void> {
  if (client && db) return; // Already connected

  if (!pendingNewsAPIConnection) {
    pendingNewsAPIConnection = (async () => {
      let candidate: MongoClient | null = null;
      try {
        candidate = new MongoClient(MONGODB_URL);
        await candidate.connect();
        db = candidate.db(DB_NAME);
        client = candidate;
        console.log(`Connected to news-api MongoDB: ${DB_NAME}`);
      } catch (error) {
        console.error('Failed to connect to news-api MongoDB:', error);
        client = null;
        db = null;
        // Best-effort cleanup of the half-open client; the original error wins.
        if (candidate) {
          await candidate.close().catch(() => undefined);
        }
        throw error;
      }
    })().finally(() => {
      pendingNewsAPIConnection = null;
    });
  }

  await pendingNewsAPIConnection;
}
|
||||
|
||||
/**
 * Load outlets from outlets-extracted.json (resolved against the current
 * working directory) and cache them for subsequent calls.
 *
 * The parsed data is validated before it is cached: `people`, `topics` and
 * `companies` must all be arrays. A malformed file therefore fails on every
 * call instead of leaving a broken object in the cache.
 *
 * @returns the cached `{ people, topics, companies }` object.
 * @throws if the file cannot be read, is not valid JSON, or has the wrong shape.
 */
export function loadOutlets(): any {
  if (outlets) return outlets;

  try {
    const outletsPath = path.resolve(process.cwd(), 'outlets-extracted.json');
    const parsed = JSON.parse(fs.readFileSync(outletsPath, 'utf-8'));

    for (const key of ['people', 'topics', 'companies']) {
      if (!parsed || !Array.isArray(parsed[key])) {
        throw new Error(`outlets-extracted.json: "${key}" must be an array`);
      }
    }

    // Only publish to the cache once the shape has been verified.
    outlets = parsed;
    console.log(`Loaded outlets: ${parsed.people.length} people, ${parsed.topics.length} topics, ${parsed.companies.length} companies`);
    return outlets;
  } catch (error) {
    console.error('Failed to load outlets from outlets-extracted.json:', error);
    throw error;
  }
}
|
||||
|
||||
/**
 * Multi-language translations for outlet names, keyed by outlet ID and then
 * by language code (`en`, `ja`, `zh_cn`, `zh_tw`, `de`, `fr`, `es`, `it`).
 *
 * Each row lists the ID followed by the English, Japanese, Simplified
 * Chinese and Traditional Chinese names. The German, French, Spanish and
 * Italian names reuse the English spelling unless the row carries explicit
 * overrides.
 */
const OUTLET_TRANSLATIONS: Record<string, Record<string, string>> = (() => {
  type EuropeanOverrides = { de?: string; fr?: string; es?: string; it?: string };
  type Row = [string, string, string, string, string, EuropeanOverrides?];

  const rows: Row[] = [
    ['도널드-트럼프', 'Donald Trump', 'ドナルド・トランプ', '唐纳德·特朗普', '唐納德·川普'],
    ['온유', 'Onew', 'オンユ', '温流', '溫流'],
    ['사토시-나카모토', 'Satoshi Nakamoto', 'サトシ・ナカモト', '中本聪', '中本聰'],
    ['일론-머스크', 'Elon Musk', 'イーロン・マスク', '埃隆·马斯克', '伊隆·馬斯克'],
    ['매기-강', 'Maggie Kang', 'マギー・カン', '玛吉·姜', '瑪姬·姜'],
    ['제롬-파월', 'Jerome Powell', 'ジェローム・パウエル', '杰罗姆·鲍威尔', '傑羅姆·鮑威爾'],
    [
      '블라디미르-푸틴',
      'Vladimir Putin',
      'ウラジーミル・プーチン',
      '弗拉基米尔·普京',
      '弗拉基米爾·普丁',
      { de: 'Wladimir Putin', fr: 'Vladimir Poutine', es: 'Vladímir Putin' },
    ],
    ['조-바이든', 'Joe Biden', 'ジョー・バイデン', '乔·拜登', '喬·拜登'],
    ['블랙핑크', 'BLACKPINK', 'ブラックピンク', 'BLACKPINK', 'BLACKPINK'],
    ['구글', 'Google', 'グーグル', '谷歌', '谷歌'],
    ['마이크로소프트', 'Microsoft', 'マイクロソフト', '微软', '微軟'],
    ['넷플릭스', 'Netflix', 'ネットフリックス', '奈飞', 'Netflix'],
    ['메타', 'Meta', 'メタ', 'Meta', 'Meta'],
    ['삼성전자', 'Samsung Electronics', 'サムスン電子', '三星电子', '三星電子'],
    ['아마존', 'Amazon', 'アマゾン', '亚马逊', '亞馬遜'],
    ['샤이니', 'SHINee', 'シャイニー', 'SHINee', 'SHINee'],
  ];

  const table: Record<string, Record<string, string>> = {};
  for (const [id, en, ja, zhCn, zhTw, overrides = {}] of rows) {
    table[id] = {
      en,
      ja,
      zh_cn: zhCn,
      zh_tw: zhTw,
      de: overrides.de ?? en,
      fr: overrides.fr ?? en,
      es: overrides.es ?? en,
      it: overrides.it ?? en,
    };
  }
  return table;
})();
|
||||
|
||||
/**
 * Translate an outlet's name and description for the given language.
 *
 * - `ko` returns the outlet untouched (the source data is Korean).
 * - The name is taken from OUTLET_TRANSLATIONS when an entry exists for the
 *   outlet ID and language; otherwise the original name is kept.
 * - The description is replaced by a fixed per-language sentence built around
 *   the display name; languages without a template keep the original
 *   description.
 *
 * All lookups are own-property checks, so a `language` such as
 * `"constructor"` cannot resolve to something inherited from
 * `Object.prototype`.
 *
 * @param outlet   outlet record; `id`, `name` and `description` are read.
 * @param language language code such as `en`, `ja`, `zh_cn`.
 * @returns the original outlet for `ko`, otherwise a shallow copy with
 *          `name` and `description` replaced.
 */
function translateOutlet(outlet: any, language: string): any {
  // If Korean, return original
  if (language === 'ko') {
    return outlet;
  }

  const hasOwn = (target: object, key: string): boolean =>
    Object.prototype.hasOwnProperty.call(target, key);

  // Use the translated name when one exists, otherwise keep the original.
  const names = hasOwn(OUTLET_TRANSLATIONS, outlet.id) ? OUTLET_TRANSLATIONS[outlet.id] : undefined;
  const displayName = names && hasOwn(names, language) && names[language]
    ? names[language]
    : outlet.name;

  // Description templates; only the one for the requested language is rendered.
  const describe: Record<string, (name: any) => string> = {
    en: name => `News and updates about ${name}`,
    ja: name => `${name}に関するニュースと最新情報`,
    zh_cn: name => `关于${name}的新闻和更新`,
    zh_tw: name => `關於${name}的新聞和更新`,
    de: name => `Nachrichten und Updates über ${name}`,
    fr: name => `Actualités et mises à jour sur ${name}`,
    es: name => `Noticias y actualizaciones sobre ${name}`,
    it: name => `Notizie e aggiornamenti su ${name}`,
  };

  return {
    ...outlet,
    name: displayName,
    description: hasOwn(describe, language) ? describe[language](displayName) : outlet.description,
  };
}
|
||||
|
||||
/**
 * Get all outlets, or only those of one category.
 *
 * Every returned outlet is translated for `language` and extended with
 * `focusSubject` (translated name, falling back to the ID) and `avatar`
 * (the outlet's `image`).
 *
 * @param category `people`, `topics` or `companies`; omit (or pass an empty
 *                 string) for all three, in that order. Any other value
 *                 yields an empty array.
 * @param language language code passed to translateOutlet (default `ko`).
 */
export function getOutlets(category?: string, language = 'ko'): any[] {
  const catalog = loadOutlets();

  // Translate one raw outlet and attach the derived fields.
  const decorate = (entry: any): any => {
    const localized = translateOutlet(entry, language);
    return {
      ...localized,
      focusSubject: localized.name || localized.id,
      avatar: localized.image
    };
  };

  if (!category) {
    const everything: any[] = [];
    for (const group of [catalog.people, catalog.topics, catalog.companies]) {
      everything.push(...group.map((entry: any) => decorate(entry)));
    }
    return everything;
  }

  if (category === 'people' || category === 'topics' || category === 'companies') {
    return catalog[category].map((entry: any) => decorate(entry));
  }

  return [];
}
|
||||
|
||||
/**
 * Get a single outlet by ID.
 *
 * @param id       outlet ID to look for across all categories.
 * @param language language code used when translating (default `ko`).
 * @returns the translated outlet with `focusSubject` and `avatar` filled in,
 *          or null when no outlet has that ID.
 */
export function getOutletById(id: string, language = 'ko'): any | null {
  const match = getOutlets(undefined, language).find(candidate => candidate.id === id);
  if (!match) {
    return null;
  }

  // Fill in focusSubject and avatar if they are not already present.
  const focusSubject = match.focusSubject || match.name || match.id;
  const avatar = match.avatar || match.image;
  return { ...match, focusSubject, avatar };
}
|
||||
|
||||
/**
 * Get the articles of an outlet, in the order listed in `outlet.articles`.
 *
 * The outlet stores English article IDs. Those are resolved to `news_id`s via
 * `articles_en`, and the matching documents are then read from
 * `articles_<language>`.
 *
 * Results are ordered by walking the outlet's own ID list rather than relying
 * on the order MongoDB returns `$in` matches in. IDs that are not valid
 * ObjectIds are skipped instead of aborting the whole request, and duplicate
 * IDs are returned once.
 *
 * @param outletId outlet ID as used by getOutletById.
 * @param limit    maximum number of outlet article IDs to consider (default 50).
 * @param language suffix of the target collection (default `en`).
 * @returns transformed articles; empty when the outlet or its articles are not found.
 * @throws Error('Database not connected') when no database handle is available.
 */
export async function getArticlesByOutlet(outletId: string, limit = 50, language = 'en'): Promise<any[]> {
  await connectToNewsAPI();
  if (!db) throw new Error('Database not connected');

  const outlet = getOutletById(outletId);
  if (!outlet) return [];

  // Normalise the outlet's ID list: first `limit` entries, valid and unique.
  const listedIds: unknown[] = Array.isArray(outlet.articles) ? outlet.articles.slice(0, limit) : [];
  const orderedIds: ObjectId[] = [];
  const seenIds = new Set<string>();
  for (const rawId of listedIds) {
    if (typeof rawId !== 'string' || !ObjectId.isValid(rawId)) continue;
    const objectId = new ObjectId(rawId);
    const hex = objectId.toString();
    if (seenIds.has(hex)) continue;
    seenIds.add(hex);
    orderedIds.push(objectId);
  }
  if (orderedIds.length === 0) return [];

  // First, get news_ids from the English collection.
  const enArticles = await db.collection('articles_en').find({
    _id: { $in: orderedIds }
  }, { projection: { news_id: 1, _id: 1 } }).toArray();

  const newsIdByEnglishId = new Map<string, any>();
  for (const en of enArticles as any[]) {
    if (en.news_id) newsIdByEnglishId.set(en._id.toString(), en.news_id);
  }
  if (newsIdByEnglishId.size === 0) return [];

  // Then get articles from the target language collection using news_ids.
  const localized = await db.collection(`articles_${language}`).find({
    news_id: { $in: Array.from(newsIdByEnglishId.values()) }
  }).toArray();

  const articleByNewsId = new Map<any, any>();
  for (const article of localized as any[]) {
    articleByNewsId.set(article.news_id, article);
  }

  // Emit in the same order as outlet.articles.
  const result: any[] = [];
  for (const objectId of orderedIds) {
    const englishId = objectId.toString();
    const newsId = newsIdByEnglishId.get(englishId);
    const article = newsId === undefined ? undefined : articleByNewsId.get(newsId);
    if (article) {
      result.push(transformArticle(article, englishId));
    }
  }
  return result;
}
|
||||
|
||||
/**
 * Get an article by news_id (preferred for cross-language support).
 *
 * Reads the article from `articles_<language>` and, when found, also looks up
 * the English document with the same news_id so its `_id` can be used for
 * the outlet lookup in transformArticle.
 *
 * @returns the transformed article, or null when the target collection has
 *          no document with that news_id.
 * @throws Error('Database not connected') when no database handle is available.
 */
export async function getArticleByNewsId(newsId: string, language = 'en'): Promise<any | null> {
  const tag = '[newsapi-client.getArticleByNewsId]';

  await connectToNewsAPI();
  if (!db) throw new Error('Database not connected');

  console.log(`${tag} newsId=${newsId}, language=${language}`);

  const collectionName = `articles_${language}`;
  const localized = await db.collection(collectionName).findOne({ news_id: newsId });
  if (!localized) {
    console.log(`${tag} Article not found in ${collectionName}`);
    return null;
  }

  // English article ID, used for the outlet lookup.
  const english = await db.collection('articles_en').findOne(
    { news_id: newsId },
    { projection: { _id: 1 } }
  );
  const englishId = english ? english._id.toString() : undefined;

  console.log(`${tag} Found article in ${collectionName}: ${localized.title}`);
  return transformArticle(localized, englishId);
}
|
||||
|
||||
/**
 * Get an article by ID (for backward compatibility).
 *
 * Lookup order:
 * 1. `_id` in `articles_<language>`.
 * 2. Otherwise, treat the ID as an English article ID: read its `news_id`
 *    from `articles_en` and fetch the document with that `news_id` from
 *    `articles_<language>`.
 *
 * A malformed ID returns null instead of letting the ObjectId constructor
 * throw. For `en`, step 2 would repeat the query of step 1, so a miss in
 * step 1 is final.
 *
 * @returns the transformed article, or null when nothing matches.
 * @throws Error('Database not connected') when no database handle is available.
 */
export async function getArticleById(id: string, language = 'en'): Promise<any | null> {
  await connectToNewsAPI();
  if (!db) throw new Error('Database not connected');

  console.log(`[newsapi-client.getArticleById] id=${id}, language=${language}`);

  if (!ObjectId.isValid(id)) {
    console.log(`[newsapi-client.getArticleById] Invalid article id`);
    return null;
  }
  const objectId = new ObjectId(id);

  // First, try to find the article directly in the requested language collection
  const collectionName = `articles_${language}`;
  const collection = db.collection(collectionName);
  const direct = await collection.findOne({ _id: objectId });

  if (direct) {
    console.log(`[newsapi-client.getArticleById] Found article directly in ${collectionName}: ${direct.title}`);
    return transformArticle(direct, id); // Pass the ID as englishArticleId
  }

  // For English the direct lookup above already was the English lookup.
  if (language === 'en') {
    console.log(`[newsapi-client.getArticleById] Article not found in any collection`);
    return null;
  }

  // The ID might be from the English collection: resolve its news_id.
  const enArticle = await db.collection('articles_en').findOne(
    { _id: objectId },
    { projection: { news_id: 1 } }
  );

  console.log(`[newsapi-client.getArticleById] Checked English collection, news_id: ${enArticle?.news_id}`);

  if (!enArticle || !enArticle.news_id) {
    console.log(`[newsapi-client.getArticleById] Article not found in any collection`);
    return null;
  }

  // For other languages, get the article using news_id
  console.log(`[newsapi-client.getArticleById] Querying ${collectionName} with news_id: ${enArticle.news_id}`);
  const localized = await collection.findOne({ news_id: enArticle.news_id });

  if (!localized) {
    console.log(`[newsapi-client.getArticleById] No article found in ${collectionName} with news_id ${enArticle.news_id}`);
    return null;
  }

  console.log(`[newsapi-client.getArticleById] Found article in ${collectionName}: ${localized.title}`);
  return transformArticle(localized, id); // Pass the English ID
}
|
||||
|
||||
/**
 * Search articles by a case-insensitive substring match on title, summary
 * or body in `articles_<language>`.
 *
 * The query is treated as literal text: regex metacharacters are escaped
 * before it is handed to `$regex`, so input such as `"("` or `".*"` neither
 * produces an invalid pattern nor changes what is matched.
 *
 * @param query    text to look for.
 * @param limit    maximum number of results (default 20).
 * @param language suffix of the collection to search (default `en`).
 * @throws Error('Database not connected') when no database handle is available.
 */
export async function searchArticles(query: string, limit = 20, language = 'en'): Promise<any[]> {
  await connectToNewsAPI();
  if (!db) throw new Error('Database not connected');

  const literal = query.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');

  const collectionName = `articles_${language}`;
  const collection = db.collection(collectionName);
  const articles = await collection.find({
    $or: [
      { title: { $regex: literal, $options: 'i' } },
      { summary: { $regex: literal, $options: 'i' } },
      { body: { $regex: literal, $options: 'i' } }
    ]
  }).limit(limit).toArray();

  // Call transformArticle with the article only: passing it straight to
  // `map` would feed the array index in as `englishArticleId`.
  return articles.map(article => transformArticle(article));
}
|
||||
|
||||
/**
 * Transform a news-api article document to the sapiens-mobile format.
 *
 * The owning outlet is found by searching the outlets' `articles` lists for
 * `englishArticleId` (when given, for non-English articles) or otherwise the
 * article's own `_id`. Articles that belong to no outlet get
 * `outletId: 'unknown'`, `outletName: 'Unknown'` and `category: 'topics'`.
 *
 * @param article          raw article document; `_id` is required.
 * @param englishArticleId ID of the English version of the article. Values
 *                         that are not non-empty strings are ignored, which
 *                         keeps the function safe to use as an array callback.
 */
function transformArticle(article: any, englishArticleId?: string): any {
  const ownId = article._id.toString();
  const articleIdStr = typeof englishArticleId === 'string' && englishArticleId
    ? englishArticleId
    : ownId;

  // Search the raw outlet data directly. Only id, name and category are read
  // below, so there is no need to build translated copies of every outlet
  // for each article.
  const catalog = loadOutlets();
  let outlet: any = undefined;
  for (const group of [catalog.people, catalog.topics, catalog.companies]) {
    outlet = group.find((candidate: any) =>
      Array.isArray(candidate.articles) && candidate.articles.includes(articleIdStr));
    if (outlet) break;
  }

  // Extract the first image or use default
  const images = article.images || [];
  const thumbnail = images.length > 0 ? images[0] : '/api/assets/default-article.png';

  // Format time ago.
  // NOTE(review): the value is clamped to 1..59 minutes, so anything older
  // than an hour still reads "59 min ago" — confirm this is intended.
  const publishedAt = article.created_at || new Date();
  const diffInMinutes = Math.floor((Date.now() - new Date(publishedAt).getTime()) / 60000);
  // An unparseable created_at yields NaN; fall back to the lower bound.
  const clampedMinutes = Number.isFinite(diffInMinutes)
    ? Math.max(1, Math.min(59, diffInMinutes))
    : 1;

  // Ensure tags is always an array of strings
  let tags: string[] = [];
  if (Array.isArray(article.subtopics)) {
    tags = article.subtopics
      .map((entry: any) => {
        if (typeof entry === 'string') return entry;
        if (entry && typeof entry === 'object' && entry.title) return entry.title;
        return null;
      })
      .filter((entry: any) => entry !== null);
  } else if (typeof article.subtopics === 'string' && article.subtopics) {
    tags = [article.subtopics];
  }

  return {
    id: ownId,
    newsId: article.news_id || ownId, // news_id for cross-language navigation
    title: article.title || 'Untitled',
    summary: article.summary || '',
    body: article.body || article.summary || '',
    thumbnail,
    publishedAt: publishedAt,
    timeAgo: `${clampedMinutes} min ago`,
    outletId: outlet?.id || 'unknown',
    outletName: outlet?.name || 'Unknown',
    tags,
    subtopics: article.subtopics || [],
    viewCount: 0,
    category: outlet?.category || 'topics'
  };
}
|
||||
|
||||
/**
 * Close the MongoDB connection.
 *
 * Does nothing when no client exists; otherwise closes the client and clears
 * the module-level `client` and `db` handles.
 */
export async function closeNewsAPIConnection(): Promise<void> {
  if (!client) {
    return;
  }

  await client.close();
  client = null;
  db = null;
  console.log('Closed news-api MongoDB connection');
}
|
||||
75
server/openai-utils.ts
Normal file
75
server/openai-utils.ts
Normal file
@ -0,0 +1,75 @@
|
||||
import OpenAI from "openai";
|
||||
|
||||
// the newest OpenAI model is "gpt-5" which was released August 7, 2025. do not change this unless explicitly requested by the user
|
||||
const openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY });
|
||||
|
||||
/** Input describing the outlet a detailed profile is generated for. */
export interface MediaOutletProfile {
  /** Outlet name; also used as the key of the batch result. */
  name: string;
  /** Outlet category, inserted into the prompt as-is. */
  category: string;
  /** Subject the outlet focuses on. */
  focusSubject: string;
  /** Short biography. */
  bio: string;
  /** Additional biography paragraphs; joined with spaces in the prompt. */
  fullBio: Array<string>;
}
|
||||
|
||||
/**
 * Ask the OpenAI chat completions API for a Wikipedia-style HTML profile of
 * the given outlet.
 *
 * @param outlet outlet details that are interpolated into the prompt.
 * @returns the content of the first choice, or an empty string when it has none.
 * @throws Error prefixed with "Failed to generate profile: " when the request fails.
 */
export async function generateDetailedProfile(outlet: MediaOutletProfile): Promise<string> {
  try {
    const { name, category, focusSubject, bio, fullBio } = outlet;
    const additionalInfo = fullBio.join(' ');

    const prompt = `Create a comprehensive, Wikipedia-style profile for the following media outlet/subject. The profile should be detailed, extensive, and well-structured with multiple sections. Write in a neutral, encyclopedic tone similar to Wikipedia articles.

Media Outlet Information:
- Name: ${name}
- Category: ${category}
- Focus Subject: ${focusSubject}
- Current Bio: ${bio}
- Additional Info: ${additionalInfo}

Please create a detailed profile that includes:

1. **Overview/Introduction** - Comprehensive introduction paragraph
2. **Background & History** - Detailed history and formation
3. **Key Achievements** - Major accomplishments and milestones
4. **Technology & Innovation** - Technical aspects, innovations, or methodologies (if applicable)
5. **Market Position & Influence** - Position in industry/market and influence
6. **Notable Developments** - Significant events, partnerships, or developments
7. **Future Outlook** - Current projects and future direction
8. **Industry Impact** - Broader impact on the industry or field

For people: Include personal background, career history, education, major contributions, and influence.
For companies: Include founding story, business model, products/services, market position, and key partnerships.
For topics/technologies: Include technical background, development history, applications, and significance.

Make it comprehensive and informative, similar to a detailed Wikipedia article. Use HTML formatting with proper headings (h2, h3), paragraphs, and lists where appropriate. Aim for 2000-3000 words.

Respond with only the HTML content, no markdown or additional formatting.`;

    const completion = await openai.chat.completions.create({
      model: "gpt-5",
      messages: [{ role: "user", content: prompt }],
      max_completion_tokens: 4000
    });

    const html = completion.choices[0].message.content;
    return html || "";
  } catch (error) {
    console.error("Error generating detailed profile:", error);
    const reason = error instanceof Error ? error.message : "Unknown error";
    throw new Error(`Failed to generate profile: ${reason}`);
  }
}
|
||||
|
||||
/**
 * Generate profiles for several outlets, one request at a time.
 *
 * A failure for one outlet does not abort the batch: that outlet gets a
 * short HTML error notice instead. Requests are spaced one second apart
 * whether or not the previous one succeeded, so a rate-limit failure is not
 * followed by an immediate retry against the API. No delay is added after
 * the last outlet.
 *
 * @param outlets outlets to process, in order.
 * @returns generated HTML (or error notice) keyed by outlet name.
 */
export async function generateBatchProfiles(outlets: MediaOutletProfile[]): Promise<Record<string, string>> {
  const profiles: Record<string, string> = {};

  for (let index = 0; index < outlets.length; index++) {
    const outlet = outlets[index];

    try {
      console.log(`Generating profile for ${outlet.name}...`);
      profiles[outlet.name] = await generateDetailedProfile(outlet);
    } catch (error) {
      console.error(`Failed to generate profile for ${outlet.name}:`, error);
      profiles[outlet.name] = `<h2>Profile Generation Error</h2><p>Unable to generate detailed profile for ${outlet.name}. Please try again later.</p>`;
    }

    // Small delay between requests to avoid rate limiting.
    if (index < outlets.length - 1) {
      await new Promise(resolve => setTimeout(resolve, 1000));
    }
  }

  return profiles;
}
|
||||
315
server/outletParser.ts
Normal file
315
server/outletParser.ts
Normal file
@ -0,0 +1,315 @@
|
||||
import { readFileSync } from 'fs';
|
||||
import { join } from 'path';
|
||||
|
||||
/** One outlet parsed from the outlet file, together with its source URLs. */
export interface OutletLinks {
  /** Cleaned display name. */
  name: string;
  /** Section of the file the outlet was found in. */
  category: 'people' | 'topics' | 'companies';
  /** Slug derived from the name; also used as the outlet ID. */
  focusSubject: string;
  /** URLs listed under the outlet. */
  urls: Array<string>;
}

/** Result of parsing an outlet file: outlets grouped by category. */
export interface ParsedOutlets {
  people: Array<OutletLinks>;
  topics: Array<OutletLinks>;
  companies: Array<OutletLinks>;
  /** Number of outlets recorded across all categories. */
  total: number;
}
|
||||
|
||||
export class OutletParser {
|
||||
|
||||
/**
 * Parse the outlet file and return structured outlet data.
 *
 * Recognised lines (after trimming; blank lines are ignored):
 * - `## People` / `## Topics` / `## Companies` (or `📋 Companies`) switch the
 *   current category.
 * - `### <n>. <name> (optional note)` starts a new outlet in the current
 *   category.
 * - `<n>. https://…` or a bare `http(s)://…` line adds a URL to the current
 *   outlet.
 * - In the companies section, `<n>. <Name> (optional note)` also starts a new
 *   outlet.
 *
 * An outlet is recorded once, when the next outlet starts or at the end of
 * the file, and only if it collected at least one URL. A `###` header that
 * cannot be parsed does not start a new outlet; URLs that follow it stay with
 * the previous outlet. URL lines that appear before any outlet are ignored.
 *
 * @param filePath path of the file to read (UTF-8).
 * @throws Error prefixed with "Failed to parse outlet file: " on any failure.
 */
static parseOutletFile(filePath: string): ParsedOutlets {
  try {
    console.log(`Parsing outlet file: ${filePath}`);

    const content = readFileSync(filePath, 'utf-8');
    const lines = content.split('\n').map(line => line.trim()).filter(line => line);

    const parsed: ParsedOutlets = {
      people: [],
      topics: [],
      companies: [],
      total: 0,
    };

    // Record a finished outlet; outlets without URLs are dropped.
    const commit = (outlet: OutletLinks | null): void => {
      if (outlet && outlet.urls.length > 0) {
        parsed[outlet.category].push(outlet);
        parsed.total++;
      }
    };

    // Build a fresh outlet from the raw name captured by a header regex.
    const startOutlet = (rawName: string, category: OutletLinks['category']): OutletLinks => {
      const cleanedName = this.cleanOutletName(rawName.trim());
      return {
        name: cleanedName,
        category,
        focusSubject: this.generateFocusSubject(cleanedName),
        urls: []
      };
    };

    let currentCategory: 'people' | 'topics' | 'companies' | null = null;
    let currentOutlet: OutletLinks | null = null;

    for (const line of lines) {
      // Detect section headers FIRST (before skipping other # lines)
      if (line.includes('## People')) {
        currentCategory = 'people';
        continue;
      } else if (line.includes('## Topics')) {
        currentCategory = 'topics';
        continue;
      } else if (line.includes('## Companies') || line.startsWith('📋 Companies')) {
        currentCategory = 'companies';
        continue;
      }

      // Skip other markdown headers (after section detection)
      if (line.startsWith('#') && !line.startsWith('###')) continue;

      // Parse outlet headers like "### 1. Ala Shaabana - Bittensor 공동창립자"
      if (line.startsWith('###') && currentCategory) {
        const nameMatch = line.match(/###\s*\d+\.\s*(.+?)(?:\s*\([^)]*\))?$/);
        if (nameMatch) {
          // Only close the previous outlet when a new one really starts;
          // otherwise it would be recorded again at the next header.
          commit(currentOutlet);
          currentOutlet = startOutlet(nameMatch[1], currentCategory);
        }
        continue;
      }

      // Parse numbered URLs like "1. https://example.com"
      const urlMatch = line.match(/^\d+\.\s*(https?:\/\/.+)$/);
      if (urlMatch) {
        // Without a current outlet the URL is ignored; it must never fall
        // through to the company-name branch below.
        if (currentOutlet) {
          currentOutlet.urls.push(urlMatch[1]);
        }
        continue;
      }

      // Parse direct URLs for companies section
      if (line.startsWith('http://') || line.startsWith('https://')) {
        if (currentOutlet) {
          currentOutlet.urls.push(line);
        }
        continue;
      }

      // Parse company entries like "1. Ava Labs (Avalanche 플랫폼)"
      if (currentCategory === 'companies' && /^\d+\.\s*[A-Za-z]/.test(line)) {
        const companyMatch = line.match(/^\d+\.\s*(.+?)(?:\s*\([^)]*\))?$/);
        if (companyMatch) {
          commit(currentOutlet);
          currentOutlet = startOutlet(companyMatch[1], 'companies');
        }
      }
    }

    // Don't forget the last outlet
    commit(currentOutlet);

    console.log(`Successfully parsed ${parsed.total} outlets:`);
    console.log(`- People: ${parsed.people.length}`);
    console.log(`- Topics: ${parsed.topics.length}`);
    console.log(`- Companies: ${parsed.companies.length}`);

    return parsed;
  } catch (error: unknown) {
    const message = error instanceof Error ? error.message : String(error);
    console.error('Error parsing outlet file:', message);
    throw new Error(`Failed to parse outlet file: ${message}`);
  }
}
|
||||
|
||||
/**
 * Clean an outlet name: strip parenthetical notes such as
 * "(연방준비제도 의장)" or "(OSS Capital 창립자)", trim the result, and expand
 * a fixed set of abbreviations to their full names.
 */
private static cleanOutletName(rawName: string): string {
  // Abbreviations that are replaced by their long form.
  const expansions: { [key: string]: string } = {
    'CBDC': 'Central Bank Digital Currency',
    'CFTC': 'Commodity Futures Trading Commission',
    'SEC': 'Securities and Exchange Commission',
    'DAT': 'Digital Asset Treasury',
    'DeFi': 'Decentralized Finance',
    'DEX': 'Decentralized Exchange',
    'NFT': 'Non-Fungible Token',
    'RWA': 'Real World Assets',
    'SWF': 'Sovereign Wealth Fund',
  };

  const withoutNotes = rawName.replace(/\s*\([^)]*\)/g, '').trim();
  const expanded = expansions[withoutNotes];
  return expanded ? expanded : withoutNotes;
}
|
||||
|
||||
/**
 * Generate the focus subject (a lowercase, dash-separated slug) for database
 * compatibility.
 *
 * Parenthetical notes are removed first. Every character other than a-z,
 * 0-9, whitespace and dashes is then dropped, so non-Latin text does not
 * survive in the slug.
 */
private static generateFocusSubject(rawName: string): string {
  const withoutNotes = rawName.replace(/\s*\([^)]*\)/g, '').trim();
  const lowered = withoutNotes.toLowerCase();

  const alphanumericOnly = lowered.replace(/[^a-z0-9\s-]/g, ''); // Remove special characters
  const dashed = alphanumericOnly.replace(/\s+/g, '-');          // Replace spaces with dashes
  const collapsed = dashed.replace(/--+/g, '-');                 // Collapse runs of dashes

  return collapsed.replace(/^-+|-+$/g, '');                      // Remove leading/trailing dashes
}
|
||||
|
||||
/**
 * Categorize an outlet based on its name.
 *
 * The name is compared with fixed lists of known people and companies. A
 * match requires one name to appear inside the other as a run of whole
 * words (case-insensitive, punctuation ignored), so "Jensen" and
 * "Jensen Huang (NVIDIA)" both match "jensen huang", while short names such
 * as "AI" no longer match because they happen to be a substring of
 * "chainlink". Empty names and everything unmatched are categorized as topics.
 */
private static categorizeOutlet(name: string): 'people' | 'topics' | 'companies' {
  // People (individual names)
  const people = [
    'ala shaabana', 'alex karp', 'arthur hayes', 'donald trump jr', 'eric trump',
    'jacob robert steeves', 'jared kushner', 'j.d. vance', 'jensen huang',
    'jerome powell', 'joseph jacks', 'robert myers', 'yat siu'
  ];

  // Companies
  const companies = [
    'xtao', 'yuma', 'taox', 'oblong', 'ava labs', 'boston dynamics',
    'blackrock', 'chainlink', 'circle', 'cme group', 'manifold labs'
  ];

  // Split into lowercase words with punctuation removed ("J.D." -> "jd").
  const toWords = (value: string): string[] =>
    value
      .toLowerCase()
      .split(/\s+/)
      .map(word => word.replace(/[^a-z0-9]/g, ''))
      .filter(word => word.length > 0);

  // True when `needle` occurs in `haystack` as a contiguous run of words.
  const containsPhrase = (haystack: string[], needle: string[]): boolean => {
    if (needle.length === 0 || needle.length > haystack.length) return false;
    for (let start = 0; start + needle.length <= haystack.length; start++) {
      if (needle.every((word, offset) => haystack[start + offset] === word)) return true;
    }
    return false;
  };

  const nameWords = toWords(name);
  const matchesAny = (known: string[]): boolean =>
    known.some(entry => {
      const entryWords = toWords(entry);
      return containsPhrase(nameWords, entryWords) || containsPhrase(entryWords, nameWords);
    });

  if (matchesAny(people)) {
    return 'people';
  }

  if (matchesAny(companies)) {
    return 'companies';
  }

  // Everything else goes to topics
  return 'topics';
}
|
||||
|
||||
// Get specific outlet data by name
|
||||
/**
 * Looks up a single outlet across all three categories.
 *
 * People are searched first, then topics, then companies. An outlet matches
 * when its display name equals `name` ignoring case, or when its
 * `focusSubject` equals `name` exactly.
 *
 * @param parsed Parsed outlet collections.
 * @param name Display name or focus-subject slug to look for.
 * @returns The first matching outlet, or `null` when there is none.
 */
static getOutletByName(parsed: ParsedOutlets, name: string): OutletLinks | null {
  const loweredName = name.toLowerCase();

  for (const group of [parsed.people, parsed.topics, parsed.companies]) {
    for (const outlet of group) {
      const sameName = outlet.name.toLowerCase() === loweredName;
      if (sameName || outlet.focusSubject === name) {
        return outlet;
      }
    }
  }

  return null;
}
|
||||
|
||||
// Get all URLs from parsed data
|
||||
/**
 * Collects every URL from every outlet, in category order
 * (people, topics, companies).
 *
 * @param parsed Parsed outlet collections.
 * @returns A flat list of all outlet URLs; duplicates are kept.
 */
static getAllUrls(parsed: ParsedOutlets): string[] {
  const { people, topics, companies } = parsed;

  return people
    .concat(topics, companies)
    .reduce<string[]>((collected, outlet) => collected.concat(outlet.urls), []);
}
|
||||
|
||||
// Get URLs by category
|
||||
/**
 * Collects the URLs of every outlet in one category.
 *
 * @param parsed Parsed outlet collections.
 * @param category Which collection to read.
 * @returns A flat list of that category's URLs; duplicates are kept.
 */
static getUrlsByCategory(parsed: ParsedOutlets, category: 'people' | 'topics' | 'companies'): string[] {
  const outletsInCategory = parsed[category];

  return outletsInCategory.reduce<string[]>(
    (collected, outlet) => collected.concat(outlet.urls),
    []
  );
}
|
||||
|
||||
// Convert parsed data to our existing outlet format
|
||||
/**
 * Converts parsed outlets into the application's outlet record shape.
 *
 * Outlets are emitted in category order (people, topics, companies). The
 * record `id` is the outlet's `focusSubject`; description, bio and images are
 * generated from the outlet's name and category.
 *
 * @param parsed Parsed outlet collections.
 * @returns One record per parsed outlet.
 */
static convertToOutletFormat(parsed: ParsedOutlets): Array<{
  id: string;
  name: string;
  description: string;
  category: string;
  focusSubject: string;
  avatar?: string;
  profileImage?: string;
  bio: string;
  fullBio?: string[];
  urls: string[];
}> {
  const { people, topics, companies } = parsed;

  return people.concat(topics, companies).map(outlet => {
    const { name, category, focusSubject, urls } = outlet;

    return {
      id: focusSubject,
      name,
      description: this.generateDescription(outlet),
      category,
      focusSubject,
      avatar: this.getDefaultAvatar(category),
      profileImage: this.getDefaultProfileImage(category),
      bio: this.generateBio(outlet),
      fullBio: this.generateFullBio(outlet),
      urls,
    };
  });
}
|
||||
|
||||
/**
 * Builds the one-line description for an outlet from a per-category template.
 *
 * @param outlet Outlet whose name and category drive the text.
 * @returns The description for the outlet's category.
 */
private static generateDescription(outlet: OutletLinks): string {
  const { name, category } = outlet;

  const templates = {
    people: `Latest news and analysis about ${name}`,
    topics: `Comprehensive coverage of ${name} developments and trends`,
    companies: `${name} news, updates, and market analysis`,
  };

  return templates[category];
}
|
||||
|
||||
/**
 * Returns the placeholder avatar path for a category.
 *
 * @param category Category key; anything without an entry falls back to the
 *   topic avatar.
 * @returns Path of the default avatar image.
 */
private static getDefaultAvatar(category: string): string {
  const topicAvatar = '/api/assets/default-topic.jpg';

  const avatarByCategory = {
    people: '/api/assets/default-person.jpg',
    topics: topicAvatar,
    companies: '/api/assets/default-company.jpg',
  };

  const configured = avatarByCategory[category as keyof typeof avatarByCategory];
  return configured ? configured : topicAvatar;
}
|
||||
|
||||
/**
 * Returns the placeholder profile image for a category.
 *
 * Profile images currently reuse the default avatar for the same category.
 *
 * @param category Category key.
 * @returns Path of the default profile image.
 */
private static getDefaultProfileImage(category: string): string {
  const sharedWithAvatar = this.getDefaultAvatar(category);
  return sharedWithAvatar;
}
|
||||
|
||||
/**
 * Builds the short, single-sentence bio for an outlet from a per-category
 * template.
 *
 * @param outlet Outlet whose name and category drive the text.
 * @returns The bio sentence for the outlet's category.
 */
private static generateBio(outlet: OutletLinks): string {
  const { name, category } = outlet;

  const templates = {
    people: `${name} is a prominent figure in technology and business, making headlines with strategic decisions and market insights.`,
    topics: `Stay informed about the latest developments in ${name} with comprehensive coverage and expert analysis.`,
    companies: `${name} continues to shape the industry with innovative solutions and strategic partnerships.`,
  };

  return templates[category];
}
|
||||
|
||||
/**
 * Builds the three-paragraph bio for an outlet from per-category templates.
 *
 * Only the first paragraph mentions the outlet by name; the other two are
 * fixed text for the category.
 *
 * @param outlet Outlet whose name and category drive the text.
 * @returns The bio paragraphs for the outlet's category.
 */
private static generateFullBio(outlet: OutletLinks): string[] {
  const { name, category } = outlet;

  const peopleParagraphs = [
    `${name} is a key figure in the technology and business landscape.`,
    `Known for strategic leadership and innovative thinking in their field.`,
    `Continues to influence industry trends and developments globally.`
  ];

  const topicParagraphs = [
    `${name} represents a critical area of technological advancement.`,
    `Dynamic sector with ongoing market trends, regulatory updates, and innovations.`,
    `Comprehensive resource requiring expert analysis from leading industry professionals.`
  ];

  const companyParagraphs = [
    `${name} is a significant player in the technology industry.`,
    `Known for innovative products and strategic market positioning.`,
    `Continues to drive industry growth and technological advancement.`
  ];

  const paragraphsByCategory = {
    people: peopleParagraphs,
    topics: topicParagraphs,
    companies: companyParagraphs
  };

  return paragraphsByCategory[category];
}
|
||||
}
|
||||
|
||||
// Utility function to parse the specific file
|
||||
/**
 * Parses the one specific pasted outlet list stored under `attached_assets/`
 * in the current working directory.
 *
 * @returns The outlets parsed from that file by `OutletParser.parseOutletFile`.
 */
export function parseAttachedOutletFile(): ParsedOutlets {
  const fileName = 'Pasted-Ala-Shaabana-https-www-rootdata-com-news-323625-https-ffnews-com-newsarticle-funding-xtao-tsx-v-1758557992922_1758557992922.txt';
  const absolutePath = join(process.cwd(), 'attached_assets', fileName);

  return OutletParser.parseOutletFile(absolutePath);
}
|
||||
|
||||
export default OutletParser;
|
||||
1772
server/routes.ts
Normal file
1772
server/routes.ts
Normal file
File diff suppressed because it is too large
Load Diff
445
server/scraper.ts
Normal file
445
server/scraper.ts
Normal file
@ -0,0 +1,445 @@
|
||||
import axios from 'axios';
|
||||
import * as cheerio from 'cheerio';
|
||||
import { createWriteStream, existsSync, mkdirSync } from 'fs';
|
||||
import { join } from 'path';
|
||||
import sharp from 'sharp';
|
||||
|
||||
/** Structured data extracted from a single article page. */
interface ScrapedArticle {
  /** URL the article was fetched from. */
  url: string;
  /** Headline text. */
  title: string;
  /** Short summary: taken from the page when present, otherwise generated from the body. */
  summary: string;
  /** Article text, with extracted fragments separated by blank lines. */
  body: string;
  /** Absolute URL of the main image, when one was found. */
  imageUrl?: string;
  /** Publication date from the page; the scrape time when the page has none. */
  publishedAt: Date;
  /** Author or byline text, when one was found. */
  author?: string;
  /** De-duplicated tag and category labels found on the page. */
  tags: string[];
}
|
||||
|
||||
/** Describes an image that was downloaded to local storage. */
interface ScrapedImage {
  /** URL the image was downloaded from. */
  url: string;
  /** File name the image was saved under. */
  filename: string;
  /** Alternative text for the image, when known. */
  alt?: string;
  /** Pixel width reported by the image metadata, when available. */
  width?: number;
  /** Pixel height reported by the image metadata, when available. */
  height?: number;
}
|
||||
|
||||
/**
 * Scrapes news article pages into {@link ScrapedArticle} records and downloads
 * their images.
 *
 * Extraction is heuristic: every field is looked up through an ordered list of
 * CSS selectors and the first usable match wins.
 */
export class WebScraper {
  private static readonly USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36';
  private static readonly TIMEOUT = 30000; // 30 seconds
  private static readonly MAX_RETRIES = 3;
  private static readonly RETRY_DELAY_MS = 2000;
  private static readonly BATCH_DELAY_MS = 1000;
  private static readonly SUMMARY_MAX_LENGTH = 150;
  private static readonly SUMMARY_ENDPOINT = 'http://localhost:5000/api/generate-summary';

  /** Ensures the directory that downloaded images are written to exists. */
  constructor() {
    const assetsDir = WebScraper.assetsDir();
    if (!existsSync(assetsDir)) {
      mkdirSync(assetsDir, { recursive: true });
    }
  }

  /** Directory (under the current working directory) for downloaded images. */
  private static assetsDir(): string {
    return join(process.cwd(), 'attached_assets', 'scraped');
  }

  /** Extracts a printable message from anything that was thrown. */
  private static errorMessage(error: unknown): string {
    return error instanceof Error ? error.message : String(error);
  }

  /** Resolves after `ms` milliseconds. */
  private static delay(ms: number): Promise<void> {
    return new Promise(resolve => setTimeout(resolve, ms));
  }

  /**
   * Fetches one page and extracts an article from it.
   *
   * Any thrown error (network, HTTP status, parsing) triggers a retry after a
   * two-second pause, up to `MAX_RETRIES` retries. A page that loads but lacks
   * a title or body is not retried.
   *
   * @param url Page to scrape.
   * @param retryCount Number of retries already performed; callers normally
   *   leave this at its default.
   * @returns The article, or `null` when extraction failed or retries ran out.
   */
  async scrapeArticle(url: string, retryCount = 0): Promise<ScrapedArticle | null> {
    try {
      console.log(`Scraping ${url} (attempt ${retryCount + 1})`);

      const response = await axios.get(url, {
        headers: {
          'User-Agent': WebScraper.USER_AGENT,
          'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
          'Accept-Language': 'en-US,en;q=0.5',
          'Accept-Encoding': 'gzip, deflate, br',
          'Connection': 'keep-alive',
          'Upgrade-Insecure-Requests': '1',
        },
        timeout: WebScraper.TIMEOUT,
        maxRedirects: 5,
      });

      const $ = cheerio.load(response.data);

      // Extract article data using multiple selectors for different news sites
      const article = await this.extractArticleData($, url);

      if (!article) {
        console.warn(`Failed to extract article data from ${url}`);
        return null;
      }

      console.log(`Successfully scraped: ${article.title}`);
      return article;
    } catch (error: unknown) {
      console.error(`Error scraping ${url}:`, WebScraper.errorMessage(error));

      if (retryCount < WebScraper.MAX_RETRIES) {
        console.log(`Retrying ${url} in 2 seconds...`);
        await WebScraper.delay(WebScraper.RETRY_DELAY_MS);
        return this.scrapeArticle(url, retryCount + 1);
      }

      return null;
    }
  }

  /**
   * Runs every field extractor over a loaded page.
   *
   * @returns The assembled article, or `null` when no title or no body was
   *   found. A missing summary is generated from the body; a missing date
   *   defaults to the current time.
   */
  private async extractArticleData($: cheerio.CheerioAPI, url: string): Promise<ScrapedArticle | null> {
    const title = this.extractTitle($);
    const summary = this.extractSummary($);
    const body = this.extractBody($);
    const imageUrl = this.extractMainImage($, url);
    const publishedAt = this.extractPublishedDate($);
    const author = this.extractAuthor($);
    const tags = this.extractTags($);

    if (!title || !body) {
      console.warn(`Missing essential data for ${url}. Title: ${!!title}, Body: ${!!body}`);
      return null;
    }

    return {
      url,
      title,
      summary: summary || (await this.generateSummaryFromBody(body, title)),
      body,
      imageUrl,
      publishedAt: publishedAt || new Date(),
      author,
      tags,
    };
  }

  /**
   * Finds the headline.
   *
   * For each selector the first matched element with non-empty text wins, so
   * an empty leading `<h1>` no longer hides a later one or ends the search.
   * The Open Graph entry is a `<meta>` tag, so its `content` attribute is read
   * instead of its (always empty) text.
   *
   * @returns The trimmed title, or `''` when nothing matched.
   */
  private extractTitle($: cheerio.CheerioAPI): string {
    // Try multiple common title selectors
    const selectors = [
      'h1.article-title',
      'h1.entry-title',
      'h1[class*="headline"]',
      'h1[class*="title"]',
      '.article-header h1',
      '.post-title',
      'h1',
      'title',
      '[property="og:title"]',
    ];

    for (const selector of selectors) {
      const matches = $(selector);
      if (!matches.length) continue;

      if (selector.includes('property')) {
        const content = matches.attr('content');
        if (content && content.trim()) {
          return content.trim();
        }
        continue;
      }

      for (const node of matches.toArray()) {
        const text = $(node).text().trim();
        if (text) {
          return text;
        }
      }
    }

    return '';
  }

  /**
   * Finds a summary or excerpt. Selectors that target `<meta>` tags read the
   * `content` attribute; all others read the element text.
   *
   * @returns The trimmed summary, or `''` when nothing matched.
   */
  private extractSummary($: cheerio.CheerioAPI): string {
    const selectors = [
      '.article-summary',
      '.entry-summary',
      '.article-excerpt',
      '.post-excerpt',
      '[class*="summary"]',
      '[class*="excerpt"]',
      '[property="og:description"]',
      '[name="description"]',
    ];

    for (const selector of selectors) {
      const element = $(selector);
      if (!element.length) continue;

      const readsMetaContent = selector.includes('property') || selector.includes('name');
      const text = readsMetaContent ? element.attr('content') : element.text();
      if (text && text.trim()) {
        return text.trim();
      }
    }

    return '';
  }

  /**
   * Finds the article text. For each selector, the text of every matched
   * element longer than 50 characters is joined with blank lines; the first
   * selector yielding more than 200 characters in total wins.
   *
   * @returns The trimmed body text, or `''` when no selector yielded enough.
   */
  private extractBody($: cheerio.CheerioAPI): string {
    const selectors = [
      '.article-content',
      '.entry-content',
      '.post-content',
      '.article-body',
      '[class*="content"]',
      '.story-body',
      '.article p',
      '.post p',
    ];

    for (const selector of selectors) {
      const elements = $(selector);
      if (!elements.length) continue;

      let bodyText = '';
      elements.each((_, el) => {
        const text = $(el).text().trim();
        if (text && text.length > 50) { // Skip short elements (ads, captions, etc.)
          bodyText += text + '\n\n';
        }
      });

      if (bodyText.length > 200) { // Ensure we have substantial content
        return bodyText.trim();
      }
    }

    return '';
  }

  /**
   * Finds the main image and returns it as an absolute http(s) URL.
   *
   * @param baseUrl URL of the page, used to resolve relative image paths.
   * @returns The image URL, or `undefined` when none could be resolved.
   */
  private extractMainImage($: cheerio.CheerioAPI, baseUrl: string): string | undefined {
    const selectors = [
      '.article-image img',
      '.featured-image img',
      '[class*="hero"] img',
      '.post-thumbnail img',
      '[property="og:image"]',
      'meta[property="og:image"]',
      '.article img:first',
      'img[class*="featured"]',
    ];

    for (const selector of selectors) {
      const element = $(selector);
      if (!element.length) continue;

      const readsMetaContent = selector.includes('property') || selector.includes('meta');
      const src = readsMetaContent ? element.attr('content') : element.attr('src');
      if (!src) continue;

      const absolute = this.toAbsoluteHttpUrl(src, baseUrl);
      if (absolute) {
        return absolute;
      }
    }

    return undefined;
  }

  /**
   * Converts an image reference to an absolute http(s) URL.
   *
   * Protocol-relative references are forced to https and root-relative ones
   * are joined to the page origin. Path-relative references such as
   * `images/a.jpg` are resolved against the page URL rather than discarded.
   *
   * @returns The absolute URL, or `undefined` for non-http schemes (for
   *   example `data:`) or references that cannot be resolved.
   */
  private toAbsoluteHttpUrl(src: string, baseUrl: string): string | undefined {
    let candidate = src;

    try {
      if (src.startsWith('//')) {
        candidate = 'https:' + src;
      } else if (src.startsWith('/')) {
        candidate = `${new URL(baseUrl).origin}${src}`;
      } else if (!/^[a-z][a-z0-9+.-]*:/i.test(src)) {
        // No scheme at all: a path relative to the page.
        candidate = new URL(src, baseUrl).href;
      }
    } catch {
      return undefined;
    }

    return candidate.startsWith('http') ? candidate : undefined;
  }

  /**
   * Finds the publication date. For each selector the `content` attribute,
   * then the `datetime` attribute, then the element text is tried.
   *
   * @returns The first value that parses as a valid date, or `null`.
   */
  private extractPublishedDate($: cheerio.CheerioAPI): Date | null {
    const selectors = [
      '[property="article:published_time"]',
      '[name="publish-date"]',
      '.publish-date',
      '.article-date',
      '.entry-date',
      'time[datetime]',
      '[class*="date"]',
    ];

    for (const selector of selectors) {
      const element = $(selector);
      if (!element.length) continue;

      const dateStr = element.attr('content') || element.attr('datetime') || element.text();
      if (!dateStr) continue;

      const date = new Date(dateStr);
      if (!isNaN(date.getTime())) {
        return date;
      }
    }

    return null;
  }

  /**
   * Finds the author or byline.
   *
   * @returns The trimmed author text, or `undefined` when nothing matched.
   */
  private extractAuthor($: cheerio.CheerioAPI): string | undefined {
    const selectors = [
      '[rel="author"]',
      '.author-name',
      '.byline',
      '[class*="author"]',
      '[property="article:author"]',
    ];

    for (const selector of selectors) {
      const element = $(selector);
      if (!element.length) continue;

      const author = selector.includes('property')
        ? element.attr('content')
        : element.text();
      if (author && author.trim()) {
        return author.trim();
      }
    }

    return undefined;
  }

  /**
   * Collects tag and category labels from all tag selectors.
   *
   * @returns Unique, trimmed labels in first-seen order.
   */
  private extractTags($: cheerio.CheerioAPI): string[] {
    const tags = new Set<string>();

    // Extract from various tag selectors
    const selectors = [
      '.tags a',
      '.tag-list a',
      '[class*="tag"] a',
      '.categories a',
      '[rel="tag"]',
    ];

    for (const selector of selectors) {
      $(selector).each((_, el) => {
        const tag = $(el).text().trim();
        if (tag) {
          tags.add(tag);
        }
      });
    }

    return Array.from(tags);
  }

  /**
   * Produces a summary for an article that has none.
   *
   * The local summary endpoint is tried first. If it fails or returns nothing,
   * a summary is assembled from the leading sentences of the body, capped at
   * 150 characters. When even the first sentence exceeds the cap it is
   * truncated with an ellipsis instead of being dropped.
   *
   * @param body Article text.
   * @param title Article title, forwarded to the summary endpoint.
   * @returns The summary; `'Content not available.'` when the body has no
   *   usable sentence.
   */
  private async generateSummaryFromBody(body: string, title?: string): Promise<string> {
    try {
      // Try AI-powered summary generation first
      const response = await axios.post(WebScraper.SUMMARY_ENDPOINT, {
        content: body,
        title: title || ''
      }, {
        timeout: 10000, // 10 second timeout
        headers: {
          'Content-Type': 'application/json'
        }
      });

      if (response.data?.summary) {
        console.log('✅ AI-generated summary created');
        return response.data.summary;
      }
    } catch (error: unknown) {
      console.warn('⚠️ AI summary generation failed, using fallback method:', error instanceof Error ? error.message : 'Unknown error');
    }

    // Fallback to basic summary generation
    const limit = WebScraper.SUMMARY_MAX_LENGTH;
    const cleanedBody = body.replace(/^This article was originally published at .+?\.\n\n/i, '').trim();
    let summary = '';

    for (const sentence of cleanedBody.split(/[.!?]+/)) {
      const trimmed = sentence.trim();
      if (trimmed.length < 10) continue; // Skip very short sentences

      if (summary.length + trimmed.length > limit) {
        if (!summary) {
          // The very first sentence is too long: keep its beginning.
          return trimmed.substring(0, limit - 1).trimEnd() + '…';
        }
        break;
      }

      summary += (summary ? '. ' : '') + trimmed;
    }

    return (summary + (summary ? '.' : 'Content not available.')).substring(0, limit);
  }

  /**
   * Downloads an image into the scraped-assets directory and reads its
   * dimensions.
   *
   * NOTE(review): `filename` is joined to the assets directory as-is; confirm
   * that callers never pass path separators or `..` taken from untrusted
   * input.
   *
   * @param imageUrl Absolute URL of the image.
   * @param filename File name to save the image under.
   * @returns Details of the saved image, or `null` when the download, the
   *   write, or the metadata read failed.
   */
  async downloadImage(imageUrl: string, filename: string): Promise<ScrapedImage | null> {
    try {
      console.log(`Downloading image: ${imageUrl}`);

      const response = await axios.get(imageUrl, {
        responseType: 'stream',
        headers: {
          'User-Agent': WebScraper.USER_AGENT,
        },
        timeout: WebScraper.TIMEOUT,
      });

      const imagePath = join(WebScraper.assetsDir(), filename);
      const writer = createWriteStream(imagePath);

      await new Promise<void>((resolve, reject) => {
        writer.on('finish', () => resolve());
        writer.on('error', reject);
        // `pipe` does not forward source errors to the destination, so a
        // broken download would otherwise leave this promise pending forever.
        response.data.on('error', (streamError: Error) => {
          writer.destroy();
          reject(streamError);
        });
        response.data.pipe(writer);
      });

      // Get image metadata
      const metadata = await sharp(imagePath).metadata();

      console.log(`Successfully downloaded: ${filename} (${metadata.width}x${metadata.height})`);

      return {
        url: imageUrl,
        filename,
        width: metadata.width,
        height: metadata.height,
      };
    } catch (error: unknown) {
      console.error(`Error downloading image ${imageUrl}:`, WebScraper.errorMessage(error));
      return null;
    }
  }

  /**
   * Writes a square, centre-cropped JPEG thumbnail of an image.
   *
   * @param imagePath Source image file.
   * @param thumbnailPath Destination file.
   * @param size Edge length of the thumbnail in pixels.
   * @returns `true` on success, `false` when processing failed.
   */
  async createThumbnail(imagePath: string, thumbnailPath: string, size = 300): Promise<boolean> {
    try {
      await sharp(imagePath)
        .resize(size, size, {
          fit: 'cover',
          position: 'center',
        })
        .jpeg({ quality: 80 })
        .toFile(thumbnailPath);

      return true;
    } catch (error: unknown) {
      console.error(`Error creating thumbnail:`, WebScraper.errorMessage(error));
      return false;
    }
  }

  /**
   * Scrapes many URLs in batches.
   *
   * URLs inside a batch are scraped in parallel; batches run one after the
   * other with a one-second pause between them (none after the last).
   *
   * @param urls Pages to scrape.
   * @param maxConcurrency Batch size; values below 1 are treated as 1.
   * @returns Scraped articles plus, for every URL that produced none, the URL
   *   and an error message.
   */
  async scrapeMultipleArticles(urls: string[], maxConcurrency = 5): Promise<{
    successes: ScrapedArticle[];
    failures: Array<{url: string; error: string}>;
  }> {
    type Outcome = { url: string; result: ScrapedArticle | null; error: string | null };

    const successes: ScrapedArticle[] = [];
    const failures: Array<{url: string; error: string}> = [];
    const batches = this.chunkArray(urls, maxConcurrency);

    for (let index = 0; index < batches.length; index++) {
      const outcomes = await Promise.all(
        batches[index].map(async (url): Promise<Outcome> => {
          try {
            return { url, result: await this.scrapeArticle(url), error: null };
          } catch (error: unknown) {
            return { url, result: null, error: WebScraper.errorMessage(error) || 'Unknown error' };
          }
        })
      );

      // Separate successes and failures
      for (const { url, result, error } of outcomes) {
        if (result) {
          successes.push(result);
        } else {
          failures.push({ url, error: error || 'Failed to scrape' });
        }
      }

      // Small delay between batches to be respectful to servers
      if (index < batches.length - 1) {
        await WebScraper.delay(WebScraper.BATCH_DELAY_MS);
      }
    }

    return { successes, failures };
  }

  /**
   * Splits an array into consecutive chunks.
   *
   * @param chunkSize Maximum chunk length. Non-finite values and values below
   *   1 are treated as 1, because a zero or negative step would never reach
   *   the end of the array.
   * @returns The chunks in order; empty for an empty input.
   */
  private chunkArray<T>(array: T[], chunkSize: number): T[][] {
    const step = Number.isFinite(chunkSize) && chunkSize >= 1 ? Math.floor(chunkSize) : 1;
    const chunks: T[][] = [];

    for (let start = 0; start < array.length; start += step) {
      chunks.push(array.slice(start, start + step));
    }

    return chunks;
  }
}
|
||||
1969
server/storage.ts
Normal file
1969
server/storage.ts
Normal file
File diff suppressed because it is too large
Load Diff
85
server/vite.ts
Normal file
85
server/vite.ts
Normal file
@ -0,0 +1,85 @@
|
||||
import express, { type Express } from "express";
|
||||
import fs from "fs";
|
||||
import path from "path";
|
||||
import { createServer as createViteServer, createLogger } from "vite";
|
||||
import { type Server } from "http";
|
||||
import viteConfig from "../vite.config";
|
||||
import { nanoid } from "nanoid";
|
||||
|
||||
const viteLogger = createLogger();
|
||||
|
||||
/**
 * Writes a timestamped line to the console in the form
 * `<time> [<source>] <message>`, with the time formatted as a 12-hour
 * `en-US` clock including seconds.
 *
 * @param message Text to log.
 * @param source Label shown in square brackets.
 */
export function log(message: string, source = "express") {
  const clockFormat: Intl.DateTimeFormatOptions = {
    hour: "numeric",
    minute: "2-digit",
    second: "2-digit",
    hour12: true,
  };
  const timestamp = new Date().toLocaleTimeString("en-US", clockFormat);

  console.log([timestamp, `[${source}]`, message].join(" "));
}
|
||||
|
||||
/**
 * Attaches a Vite dev server to the Express app in middleware mode.
 *
 * Vite's middlewares are mounted first; a catch-all route then serves
 * `client/index.html`, re-read from disk on every request and passed through
 * Vite's HTML transform. Any error reported through Vite's logger terminates
 * the process.
 *
 * @param app Express application to mount the middleware on.
 * @param server HTTP server that Vite's HMR attaches to.
 */
export async function setupVite(app: Express, server: Server) {
  const vite = await createViteServer({
    ...viteConfig,
    configFile: false,
    customLogger: {
      ...viteLogger,
      // Log the error as usual, then stop the process.
      error: (msg, options) => {
        viteLogger.error(msg, options);
        process.exit(1);
      },
    },
    server: {
      middlewareMode: true,
      hmr: { server },
      allowedHosts: true as const,
    },
    appType: "custom",
  });

  app.use(vite.middlewares);

  app.use("*", async (req, res, next) => {
    const requestedUrl = req.originalUrl;

    try {
      const indexHtmlPath = path.resolve(import.meta.dirname, "..", "client", "index.html");

      // always reload the index.html file from disk in case it changes
      const rawTemplate = await fs.promises.readFile(indexHtmlPath, "utf-8");
      // A fresh query string on the entry script on every request.
      const versionedTemplate = rawTemplate.replace(
        `src="/src/main.tsx"`,
        `src="/src/main.tsx?v=${nanoid()}"`,
      );
      const html = await vite.transformIndexHtml(requestedUrl, versionedTemplate);

      res.status(200).set({ "Content-Type": "text/html" }).end(html);
    } catch (e) {
      vite.ssrFixStacktrace(e as Error);
      next(e);
    }
  });
}
|
||||
|
||||
/**
 * Serves the built client from the `public` directory next to this module.
 *
 * Static files are served directly; any request that matches no file gets
 * `index.html`.
 *
 * @param app Express application to mount the handlers on.
 * @throws Error when the build directory does not exist.
 */
export function serveStatic(app: Express) {
  const buildDir = path.resolve(import.meta.dirname, "public");
  const indexHtml = path.resolve(buildDir, "index.html");

  const buildExists = fs.existsSync(buildDir);
  if (!buildExists) {
    throw new Error(
      `Could not find the build directory: ${buildDir}, make sure to build the client first`,
    );
  }

  app.use(express.static(buildDir));

  // fall through to index.html if the file doesn't exist
  app.use("*", (_req, res) => {
    res.sendFile(indexHtml);
  });
}
|
||||
Reference in New Issue
Block a user