feat: SAPIENS Mobile App - Initial commit
React Native mobile application for SAPIENS news platform. Consolidated all previous history into a single commit. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
445
server/scraper.ts
Normal file
445
server/scraper.ts
Normal file
@@ -0,0 +1,445 @@
|
||||
import axios from 'axios';
|
||||
import * as cheerio from 'cheerio';
|
||||
import { createWriteStream, existsSync, mkdirSync } from 'fs';
|
||||
import { join } from 'path';
|
||||
import sharp from 'sharp';
|
||||
|
||||
/**
 * Article content extracted from a single web page by the scraper.
 */
interface ScrapedArticle {
  /** Address of the page the article was scraped from. */
  url: string;

  /** Headline text of the article. */
  title: string;

  /** Short description of the article. */
  summary: string;

  /** Main article text. */
  body: string;

  /** Absolute URL of the article's lead image, when one was found. */
  imageUrl?: string;

  /** Publication timestamp of the article. */
  publishedAt: Date;

  /** Author / byline text, when one was found. */
  author?: string;

  /** Tag and category labels attached to the article. */
  tags: Array<string>;
}
|
||||
|
||||
/**
 * Description of an image that was downloaded to local storage.
 */
interface ScrapedImage {
  /** Remote URL the image was fetched from. */
  url: string;

  /** Name of the file the image was written to. */
  filename: string;

  /** Alternative text for the image, if known. */
  alt?: string;

  /** Pixel width of the image, if it could be determined. */
  width?: number;

  /** Pixel height of the image, if it could be determined. */
  height?: number;
}
|
||||
|
||||
/**
 * Scrapes news articles from arbitrary sites using a list of fallback CSS
 * selectors, and downloads / thumbnails their images into
 * `<cwd>/attached_assets/scraped`.
 *
 * The public scraping and image methods report failure through their return
 * value (`null` / `false` / a `failures` list) and log the cause, instead of
 * throwing.
 */
export class WebScraper {
  private static readonly USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36';
  private static readonly TIMEOUT = 30000; // 30 seconds
  /** Number of retries AFTER the first attempt (so up to 4 requests per URL). */
  private static readonly MAX_RETRIES = 3;
  private static readonly RETRY_DELAY_MS = 2000;
  private static readonly BATCH_DELAY_MS = 1000;
  private static readonly SUMMARY_MAX_LENGTH = 150;
  private static readonly SUMMARY_ENDPOINT = 'http://localhost:5000/api/generate-summary';
  /** Selectors that target `<meta>` tags; their value is in the `content` attribute, not the text. */
  private static readonly META_SELECTOR = /\[(?:property|name)=/;

  /** Directory that downloaded images are written to. */
  private readonly assetsDir = join(process.cwd(), 'attached_assets', 'scraped');

  constructor() {
    // Create assets directory if it doesn't exist
    if (!existsSync(this.assetsDir)) {
      mkdirSync(this.assetsDir, { recursive: true });
    }
  }

  /**
   * Fetches a page and extracts the article it contains.
   *
   * Failed attempts are retried after a fixed delay, up to `MAX_RETRIES`
   * times. Client errors that will not change on retry (HTTP 4xx other than
   * 408 and 429) are not retried.
   *
   * @param url - Page to scrape.
   * @param retryCount - Number of retries already performed (used by the recursion).
   * @returns The extracted article, or `null` if the page could not be fetched
   *   or lacks a title/body.
   */
  async scrapeArticle(url: string, retryCount = 0): Promise<ScrapedArticle | null> {
    try {
      console.log(`Scraping ${url} (attempt ${retryCount + 1})`);

      const response = await axios.get(url, {
        headers: {
          'User-Agent': WebScraper.USER_AGENT,
          'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
          'Accept-Language': 'en-US,en;q=0.5',
          'Accept-Encoding': 'gzip, deflate, br',
          'Connection': 'keep-alive',
          'Upgrade-Insecure-Requests': '1',
        },
        timeout: WebScraper.TIMEOUT,
        maxRedirects: 5,
      });

      const $ = cheerio.load(response.data);

      // Extract article data using multiple selectors for different news sites
      const article = await this.extractArticleData($, url);
      if (!article) {
        console.warn(`Failed to extract article data from ${url}`);
        return null;
      }

      console.log(`Successfully scraped: ${article.title}`);
      return article;
    } catch (error: unknown) {
      console.error(`Error scraping ${url}:`, WebScraper.describeError(error));

      if (retryCount < WebScraper.MAX_RETRIES && WebScraper.isRetryable(error)) {
        console.log(`Retrying ${url} in 2 seconds...`);
        await WebScraper.sleep(WebScraper.RETRY_DELAY_MS);
        return this.scrapeArticle(url, retryCount + 1);
      }

      return null;
    }
  }

  /**
   * Runs every field extractor against the parsed page.
   *
   * @returns `null` when the page has no usable title or body; otherwise the
   *   article, with a generated summary and the current time substituted for a
   *   missing summary / publication date.
   */
  private async extractArticleData($: cheerio.CheerioAPI, url: string): Promise<ScrapedArticle | null> {
    const title = this.extractTitle($);
    const summary = this.extractSummary($);
    const body = this.extractBody($);
    const imageUrl = this.extractMainImage($, url);
    const publishedAt = this.extractPublishedDate($);
    const author = this.extractAuthor($);
    const tags = this.extractTags($);

    if (!title || !body) {
      console.warn(`Missing essential data for ${url}. Title: ${!!title}, Body: ${!!body}`);
      return null;
    }

    return {
      url,
      title,
      summary: summary || (await this.generateSummaryFromBody(body, title)),
      body,
      imageUrl,
      publishedAt: publishedAt ?? new Date(),
      author,
      tags,
    };
  }

  /**
   * Returns the first non-empty value produced by `selector`.
   *
   * Meta-tag selectors (`[property=…]`, `[name=…]`) are read from the
   * `content` attribute; everything else from the element text. Matches are
   * inspected one by one so an empty first match does not hide a later one.
   */
  private firstValue($: cheerio.CheerioAPI, selector: string): string {
    const readsContentAttribute = WebScraper.META_SELECTOR.test(selector);
    for (const node of $(selector).toArray()) {
      const raw = readsContentAttribute ? $(node).attr('content') : $(node).text();
      const value = raw?.trim();
      if (value) {
        return value;
      }
    }
    return '';
  }

  /** Returns the value of the first selector in `selectors` that yields a non-empty value, else `''`. */
  private firstValueOf($: cheerio.CheerioAPI, selectors: readonly string[]): string {
    for (const selector of selectors) {
      const value = this.firstValue($, selector);
      if (value) {
        return value;
      }
    }
    return '';
  }

  /** Extracts the headline, trying specific selectors first and page-level metadata last. */
  private extractTitle($: cheerio.CheerioAPI): string {
    return this.firstValueOf($, [
      'h1.article-title',
      'h1.entry-title',
      'h1[class*="headline"]',
      'h1[class*="title"]',
      '.article-header h1',
      '.post-title',
      'h1',
      'title',
      '[property="og:title"]',
    ]);
  }

  /** Extracts a publisher-provided summary/excerpt, or `''` when the page has none. */
  private extractSummary($: cheerio.CheerioAPI): string {
    return this.firstValueOf($, [
      '.article-summary',
      '.entry-summary',
      '.article-excerpt',
      '.post-excerpt',
      '[class*="summary"]',
      '[class*="excerpt"]',
      '[property="og:description"]',
      '[name="description"]',
    ]);
  }

  /**
   * Extracts the article text.
   *
   * For each selector, the text of every matching element longer than 50
   * characters is joined with blank lines; the first selector producing more
   * than 200 characters wins. Returns `''` when nothing qualifies.
   */
  private extractBody($: cheerio.CheerioAPI): string {
    const selectors = [
      '.article-content',
      '.entry-content',
      '.post-content',
      '.article-body',
      '[class*="content"]',
      '.story-body',
      '.article p',
      '.post p',
    ];

    for (const selector of selectors) {
      const matches = $(selector);
      if (!matches.length) {
        continue;
      }

      const paragraphs: string[] = [];
      matches
        // Keep only outermost matches: a nested match's text is already part
        // of its ancestor's text and would otherwise be collected twice.
        .filter((_, el) => $(el).parents(selector).length === 0)
        .each((_, el) => {
          const text = $(el).text().trim();
          if (text.length > 50) { // Skip short elements (ads, captions, etc.)
            paragraphs.push(text);
          }
        });

      const bodyText = paragraphs.join('\n\n');
      if (bodyText.length > 200) { // Ensure we have substantial content
        return bodyText;
      }
    }

    return '';
  }

  /**
   * Finds the lead image and returns its absolute http(s) URL.
   *
   * @param baseUrl - URL of the page, used to resolve relative image paths.
   * @returns The absolute image URL, or `undefined` when no usable image exists.
   */
  private extractMainImage($: cheerio.CheerioAPI, baseUrl: string): string | undefined {
    const selectors = [
      '.article-image img',
      '.featured-image img',
      '[class*="hero"] img',
      '.post-thumbnail img',
      '[property="og:image"]',
      '.article img',
      'img[class*="featured"]',
    ];

    for (const selector of selectors) {
      const attribute = WebScraper.META_SELECTOR.test(selector) ? 'content' : 'src';
      for (const node of $(selector).toArray()) {
        const src = $(node).attr(attribute)?.trim();
        const absolute = src ? WebScraper.resolveHttpUrl(src, baseUrl) : undefined;
        if (absolute) {
          return absolute;
        }
      }
    }

    return undefined;
  }

  /**
   * Resolves `src` against `baseUrl` (handles protocol-relative, root-relative
   * and path-relative references) and returns it only if the result is an
   * http(s) URL.
   */
  private static resolveHttpUrl(src: string, baseUrl: string): string | undefined {
    try {
      const resolved = new URL(src, baseUrl);
      return resolved.protocol === 'http:' || resolved.protocol === 'https:'
        ? resolved.href
        : undefined;
    } catch {
      // Unparseable URL: treat as "no image" rather than failing the scrape.
      return undefined;
    }
  }

  /** Extracts the publication date from metadata or date elements; `null` when none parses. */
  private extractPublishedDate($: cheerio.CheerioAPI): Date | null {
    const selectors = [
      '[property="article:published_time"]',
      '[name="publish-date"]',
      '.publish-date',
      '.article-date',
      '.entry-date',
      'time[datetime]',
      '[class*="date"]',
    ];

    for (const selector of selectors) {
      for (const node of $(selector).toArray()) {
        const element = $(node);
        const raw = (element.attr('content') || element.attr('datetime') || element.text()).trim();
        if (!raw) {
          continue;
        }
        const parsed = new Date(raw);
        if (!Number.isNaN(parsed.getTime())) {
          return parsed;
        }
      }
    }

    return null;
  }

  /** Extracts the author / byline text, or `undefined` when none is found. */
  private extractAuthor($: cheerio.CheerioAPI): string | undefined {
    const author = this.firstValueOf($, [
      '[rel="author"]',
      '.author-name',
      '.byline',
      '[class*="author"]',
      '[property="article:author"]',
    ]);
    return author || undefined;
  }

  /** Collects the de-duplicated tag/category link texts, in document order per selector. */
  private extractTags($: cheerio.CheerioAPI): string[] {
    const tags = new Set<string>();
    const selectors = [
      '.tags a',
      '.tag-list a',
      '[class*="tag"] a',
      '.categories a',
      '[rel="tag"]',
    ];

    for (const selector of selectors) {
      $(selector).each((_, el) => {
        const tag = $(el).text().trim();
        if (tag) {
          tags.add(tag);
        }
      });
    }

    return Array.from(tags);
  }

  /**
   * Produces a summary for an article that has none.
   *
   * Asks the summary service first; if that call fails or returns no usable
   * text, falls back to a locally built summary made of leading sentences.
   */
  private async generateSummaryFromBody(body: string, title?: string): Promise<string> {
    try {
      // Try AI-powered summary generation first
      const response = await axios.post(
        WebScraper.SUMMARY_ENDPOINT,
        { content: body, title: title ?? '' },
        {
          timeout: 10000, // 10 second timeout
          headers: { 'Content-Type': 'application/json' },
        },
      );

      const aiSummary: unknown = response.data?.summary;
      if (typeof aiSummary === 'string' && aiSummary.trim()) {
        console.log('✅ AI-generated summary created');
        return aiSummary;
      }
    } catch (error: unknown) {
      console.warn('⚠️ AI summary generation failed, using fallback method:', WebScraper.describeError(error));
    }

    return this.buildFallbackSummary(body);
  }

  /**
   * Builds a summary of at most `SUMMARY_MAX_LENGTH` characters from the
   * leading sentences of `body`.
   *
   * Sentences shorter than 10 characters are skipped. If even the first
   * sentence does not fit, it is cut at a word boundary and suffixed with
   * `...` instead of reporting that no content is available.
   */
  private buildFallbackSummary(body: string): string {
    const limit = WebScraper.SUMMARY_MAX_LENGTH;
    const cleanedBody = body.replace(/^This article was originally published at .+?\.\n\n/i, '').trim();
    const sentences = cleanedBody
      .split(/[.!?]+/)
      .map((sentence) => sentence.replace(/\s+/g, ' ').trim())
      .filter((sentence) => sentence.length >= 10); // Skip very short sentences

    let summary = '';
    for (const sentence of sentences) {
      const candidate = summary ? `${summary}. ${sentence}` : sentence;
      // +1 leaves room for the closing period so nothing has to be chopped off afterwards.
      if (candidate.length + 1 > limit) {
        break;
      }
      summary = candidate;
    }
    if (summary) {
      return `${summary}.`;
    }

    const [firstSentence] = sentences;
    if (!firstSentence) {
      return 'Content not available.';
    }

    const clipped = firstSentence.slice(0, limit - 3);
    const lastSpace = clipped.lastIndexOf(' ');
    const head = lastSpace > 0 ? clipped.slice(0, lastSpace) : clipped;
    return `${head.trimEnd()}...`;
  }

  /**
   * Downloads an image into the assets directory and reads its dimensions.
   *
   * @param imageUrl - Remote image URL.
   * @param filename - Target file name inside the assets directory; names that
   *   resolve outside that directory are rejected.
   * @returns Image info, or `null` if the download or metadata read failed.
   */
  async downloadImage(imageUrl: string, filename: string): Promise<ScrapedImage | null> {
    try {
      console.log(`Downloading image: ${imageUrl}`);

      const imagePath = join(this.assetsDir, filename);
      // `join` normalizes `..` segments, so a prefix check detects escapes.
      if (!imagePath.startsWith(join(this.assetsDir, '/'))) {
        throw new Error(`Refusing to write outside the assets directory: ${filename}`);
      }

      const response = await axios.get(imageUrl, {
        responseType: 'stream',
        headers: {
          'User-Agent': WebScraper.USER_AGENT,
        },
        timeout: WebScraper.TIMEOUT,
      });

      const source = response.data;
      const writer = createWriteStream(imagePath);

      await new Promise<void>((resolve, reject) => {
        writer.on('finish', () => resolve());
        writer.on('error', reject);
        // `pipe` does not forward source errors; without this handler a
        // dropped connection would leave the promise pending forever.
        source.on('error', (streamError: Error) => {
          writer.destroy();
          reject(streamError);
        });
        source.pipe(writer);
      });

      // Get image metadata
      const metadata = await sharp(imagePath).metadata();

      console.log(`Successfully downloaded: ${filename} (${metadata.width}x${metadata.height})`);

      return {
        url: imageUrl,
        filename,
        width: metadata.width,
        height: metadata.height,
      };
    } catch (error: unknown) {
      console.error(`Error downloading image ${imageUrl}:`, WebScraper.describeError(error));
      return null;
    }
  }

  /**
   * Writes a square, center-cropped JPEG thumbnail of an image.
   *
   * @param imagePath - Source image file.
   * @param thumbnailPath - Destination file.
   * @param size - Edge length in pixels.
   * @returns `true` on success, `false` if processing failed.
   */
  async createThumbnail(imagePath: string, thumbnailPath: string, size = 300): Promise<boolean> {
    try {
      await sharp(imagePath)
        .resize(size, size, {
          fit: 'cover',
          position: 'center',
        })
        .jpeg({ quality: 80 })
        .toFile(thumbnailPath);

      return true;
    } catch (error: unknown) {
      console.error(`Error creating thumbnail:`, WebScraper.describeError(error));
      return false;
    }
  }

  /**
   * Scrapes many URLs in batches of `maxConcurrency` parallel requests, with a
   * short pause between batches to be respectful to servers.
   *
   * @param urls - Pages to scrape.
   * @param maxConcurrency - Batch size; values below 1 (or `NaN`) are treated as 1.
   * @returns Scraped articles plus one failure entry per URL that yielded nothing.
   */
  async scrapeMultipleArticles(urls: string[], maxConcurrency = 5): Promise<{
    successes: ScrapedArticle[];
    failures: Array<{url: string; error: string}>;
  }> {
    const successes: ScrapedArticle[] = [];
    const failures: Array<{url: string; error: string}> = [];
    const batchSize = maxConcurrency >= 1 ? Math.floor(maxConcurrency) : 1;
    const chunks = this.chunkArray(urls, batchSize);

    for (const [index, chunk] of chunks.entries()) {
      const chunkResults = await Promise.all(
        chunk.map(async (url) => {
          try {
            return { url, result: await this.scrapeArticle(url), error: null as string | null };
          } catch (error: unknown) {
            return { url, result: null, error: WebScraper.describeError(error) };
          }
        }),
      );

      // Separate successes and failures
      for (const { url, result, error } of chunkResults) {
        if (result) {
          successes.push(result);
        } else {
          failures.push({ url, error: error || 'Failed to scrape' });
        }
      }

      // Small delay between batches; pointless after the final one.
      if (index < chunks.length - 1) {
        await WebScraper.sleep(WebScraper.BATCH_DELAY_MS);
      }
    }

    return { successes, failures };
  }

  /**
   * Splits `array` into consecutive slices of at most `chunkSize` items.
   *
   * @throws RangeError if `chunkSize` is not a number >= 1 (a smaller step
   *   would never advance the loop).
   */
  private chunkArray<T>(array: T[], chunkSize: number): T[][] {
    if (!(chunkSize >= 1)) {
      throw new RangeError(`chunkSize must be >= 1, got ${chunkSize}`);
    }

    const chunks: T[][] = [];
    for (let i = 0; i < array.length; i += chunkSize) {
      chunks.push(array.slice(i, i + chunkSize));
    }
    return chunks;
  }

  /** Decides whether a failed scrape attempt is worth repeating. */
  private static isRetryable(error: unknown): boolean {
    if (!axios.isAxiosError(error)) {
      return true;
    }
    const status = error.response?.status;
    if (status === undefined) {
      return true; // No response at all: network error or timeout.
    }
    // 408 (timeout) and 429 (rate limit) are transient; other 4xx are permanent.
    return status < 400 || status >= 500 || status === 408 || status === 429;
  }

  /** Produces a loggable message from any thrown value. */
  private static describeError(error: unknown): string {
    return error instanceof Error ? error.message : String(error);
  }

  /** Resolves after `ms` milliseconds. */
  private static sleep(ms: number): Promise<void> {
    return new Promise((resolve) => setTimeout(resolve, ms));
  }
}
|
||||
Reference in New Issue
Block a user