React Native mobile application for the SAPIENS news platform. Consolidated all previous history into a single commit. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
445 lines
12 KiB
TypeScript
445 lines
12 KiB
TypeScript
import axios from 'axios';
|
|
import * as cheerio from 'cheerio';
|
|
import { createWriteStream, existsSync, mkdirSync } from 'fs';
|
|
import { join } from 'path';
|
|
import sharp from 'sharp';
|
|
|
|
/**
 * Structured article data produced by `WebScraper.scrapeArticle`.
 *
 * Exported so that callers of the public `WebScraper` API can name the type
 * of the values they receive.
 */
export interface ScrapedArticle {
  /** URL of the page the article was scraped from. */
  url: string;
  /** Article headline; never empty, because articles without a title are rejected. */
  title: string;
  /** Summary taken from the page, or generated from the body when the page has none. */
  summary: string;
  /** Plain-text article body; never empty, because articles without a body are rejected. */
  body: string;
  /** Absolute URL of the main image, when one was found. */
  imageUrl?: string;
  /** Publication date from the page; the scrape time is used when none can be parsed. */
  publishedAt: Date;
  /** Author or byline text, when one was found. */
  author?: string;
  /** De-duplicated tag and category labels found on the page (possibly empty). */
  tags: string[];
}
|
|
|
|
/**
 * Description of an image saved to disk by `WebScraper.downloadImage`.
 *
 * Exported so that callers of the public `WebScraper` API can name the type
 * of the values they receive.
 */
export interface ScrapedImage {
  /** URL the image was downloaded from. */
  url: string;
  /** File name the image was written to inside the scraped-assets directory. */
  filename: string;
  /** Alternative text for the image; not populated by `WebScraper.downloadImage`. */
  alt?: string;
  /** Pixel width reported by the image metadata, when available. */
  width?: number;
  /** Pixel height reported by the image metadata, when available. */
  height?: number;
}
|
|
|
|
/**
 * Fetches news-article pages over HTTP and extracts structured article data
 * (title, summary, body, main image, date, author, tags) using lists of
 * fallback CSS selectors. Also downloads images into
 * `<cwd>/attached_assets/scraped` and creates thumbnails with `sharp`.
 */
export class WebScraper {
  private static readonly USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36';
  private static readonly TIMEOUT = 30000; // 30 seconds
  private static readonly MAX_RETRIES = 3;
  private static readonly RETRY_DELAY_MS = 2000;
  private static readonly BATCH_DELAY_MS = 1000;
  private static readonly SUMMARY_ENDPOINT = 'http://localhost:5000/api/generate-summary';
  private static readonly SUMMARY_MAX_LENGTH = 150;
  /** Selectors that target `<meta>`-style tags, whose value lives in the `content` attribute. */
  private static readonly META_SELECTOR = /^(?:meta)?\[(?:property|name)=/;

  /** Creates the scraped-assets directory if it does not exist yet. */
  constructor() {
    const assetsDir = WebScraper.assetsDir();
    if (!existsSync(assetsDir)) {
      mkdirSync(assetsDir, { recursive: true });
    }
  }

  /**
   * Fetches `url` and extracts an article from the returned HTML.
   *
   * Failed attempts are retried after a 2 second pause, up to `MAX_RETRIES`
   * retries. Responses with a permanent HTTP client error (4xx other than
   * 408 and 429) are not retried, since repeating the request cannot succeed.
   *
   * @param url Page to scrape.
   * @param retryCount Number of attempts already made; callers normally omit it.
   * @returns The article, or `null` when fetching or extraction failed.
   */
  async scrapeArticle(url: string, retryCount = 0): Promise<ScrapedArticle | null> {
    for (let attempt = retryCount; ; attempt += 1) {
      try {
        console.log(`Scraping ${url} (attempt ${attempt + 1})`);

        const response = await axios.get(url, {
          headers: {
            'User-Agent': WebScraper.USER_AGENT,
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate, br',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
          },
          timeout: WebScraper.TIMEOUT,
          maxRedirects: 5,
        });

        // axios hands back a parsed object for JSON bodies; only markup can be scraped.
        if (typeof response.data !== 'string') {
          console.warn(`Failed to extract article data from ${url}`);
          return null;
        }

        const $ = cheerio.load(response.data);
        const article = await this.extractArticleData($, url);

        if (!article) {
          console.warn(`Failed to extract article data from ${url}`);
          return null;
        }

        console.log(`Successfully scraped: ${article.title}`);
        return article;
      } catch (error: unknown) {
        console.error(`Error scraping ${url}:`, WebScraper.describeError(error));

        if (attempt >= WebScraper.MAX_RETRIES || !WebScraper.isRetryable(error)) {
          return null;
        }

        console.log(`Retrying ${url} in 2 seconds...`);
        await WebScraper.sleep(WebScraper.RETRY_DELAY_MS);
      }
    }
  }

  /**
   * Runs every field extractor against the loaded document.
   *
   * @returns The assembled article, or `null` when no title or no body was found.
   */
  private async extractArticleData($: cheerio.CheerioAPI, url: string): Promise<ScrapedArticle | null> {
    const title = this.extractTitle($);
    const body = this.extractBody($);

    if (!title || !body) {
      console.warn(`Missing essential data for ${url}. Title: ${!!title}, Body: ${!!body}`);
      return null;
    }

    const summary = this.extractSummary($) || (await this.generateSummaryFromBody(body, title));

    return {
      url,
      title,
      summary,
      body,
      imageUrl: this.extractMainImage($, url),
      publishedAt: this.extractPublishedDate($) ?? new Date(),
      author: this.extractAuthor($),
      tags: this.extractTags($),
    };
  }

  /**
   * Returns the first non-empty, trimmed value found among the elements
   * matching `selector`.
   *
   * Meta-tag selectors (`[property=…]`, `[name=…]`, `meta[…]`) are read from
   * the `content` attribute. Other selectors are read from `elementAttr`
   * when given, otherwise from the element's text.
   */
  private readFirst($: cheerio.CheerioAPI, selector: string, elementAttr?: string): string {
    const attr = WebScraper.META_SELECTOR.test(selector) ? 'content' : elementAttr;

    for (const el of $(selector).toArray()) {
      const node = $(el);
      const value = (attr === undefined ? node.text() : (node.attr(attr) ?? '')).trim();
      if (value) {
        return value;
      }
    }

    return '';
  }

  /** Returns the article headline, or `''` when no selector yields text. */
  private extractTitle($: cheerio.CheerioAPI): string {
    const selectors = [
      'h1.article-title',
      'h1.entry-title',
      'h1[class*="headline"]',
      'h1[class*="title"]',
      '.article-header h1',
      '.post-title',
      'h1',
      'title',
      '[property="og:title"]',
    ];

    for (const selector of selectors) {
      const title = this.readFirst($, selector);
      if (title) {
        return title;
      }
    }

    return '';
  }

  /** Returns the page-provided summary or description, or `''` when there is none. */
  private extractSummary($: cheerio.CheerioAPI): string {
    const selectors = [
      '.article-summary',
      '.entry-summary',
      '.article-excerpt',
      '.post-excerpt',
      '[class*="summary"]',
      '[class*="excerpt"]',
      '[property="og:description"]',
      '[name="description"]',
    ];

    for (const selector of selectors) {
      const summary = this.readFirst($, selector);
      if (summary) {
        return summary;
      }
    }

    return '';
  }

  /**
   * Returns the article text from the first selector that yields more than
   * 200 characters, with blocks separated by blank lines; `''` otherwise.
   */
  private extractBody($: cheerio.CheerioAPI): string {
    const selectors = [
      '.article-content',
      '.entry-content',
      '.post-content',
      '.article-body',
      '[class*="content"]',
      '.story-body',
      '.article p',
      '.post p',
    ];

    for (const selector of selectors) {
      let bodyText = '';

      for (const el of $(selector).toArray()) {
        const node = $(el);

        // A broad selector can match a container and elements nested inside it.
        // The container's text already includes theirs, so skip nested matches
        // to avoid emitting the same paragraphs twice.
        if (node.parents(selector).length > 0) {
          continue;
        }

        const text = node.text().trim();
        if (text.length > 50) { // Skip short elements (ads, captions, etc.)
          bodyText += `${text}\n\n`;
        }
      }

      if (bodyText.length > 200) { // Ensure we have substantial content
        return bodyText.trim();
      }
    }

    return '';
  }

  /**
   * Returns the absolute http(s) URL of the article's main image.
   *
   * Relative references (root-relative and path-relative) are resolved against
   * `baseUrl`; protocol-relative references are given the `https:` scheme.
   *
   * @returns The image URL, or `undefined` when no usable image was found.
   */
  private extractMainImage($: cheerio.CheerioAPI, baseUrl: string): string | undefined {
    const selectors = [
      '.article-image img',
      '.featured-image img',
      '[class*="hero"] img',
      '.post-thumbnail img',
      '[property="og:image"]',
      'meta[property="og:image"]',
      '.article img:first',
      'img[class*="featured"]',
    ];

    for (const selector of selectors) {
      const raw = this.readFirst($, selector, 'src');
      if (!raw) {
        continue;
      }

      const candidate = raw.startsWith('//') ? `https:${raw}` : raw;

      try {
        const resolved = new URL(candidate, baseUrl);
        if (resolved.protocol === 'http:' || resolved.protocol === 'https:') {
          return resolved.href;
        }
      } catch {
        // Unparseable reference or base URL: fall through to the next selector.
      }
    }

    return undefined;
  }

  /**
   * Returns the first parseable publication date found on the page.
   *
   * For each matching element the `content` attribute, then the `datetime`
   * attribute, then the element text is tried.
   */
  private extractPublishedDate($: cheerio.CheerioAPI): Date | null {
    const selectors = [
      '[property="article:published_time"]',
      '[name="publish-date"]',
      '.publish-date',
      '.article-date',
      '.entry-date',
      'time[datetime]',
      '[class*="date"]',
    ];

    for (const selector of selectors) {
      for (const el of $(selector).toArray()) {
        const node = $(el);
        const raw = (node.attr('content') || node.attr('datetime') || node.text()).trim();
        if (!raw) {
          continue;
        }

        const date = new Date(raw);
        if (!Number.isNaN(date.getTime())) {
          return date;
        }
      }
    }

    return null;
  }

  /** Returns the author or byline text, or `undefined` when none is found. */
  private extractAuthor($: cheerio.CheerioAPI): string | undefined {
    const selectors = [
      '[rel="author"]',
      '.author-name',
      '.byline',
      '[class*="author"]',
      '[property="article:author"]',
    ];

    for (const selector of selectors) {
      const author = this.readFirst($, selector);
      if (author) {
        return author;
      }
    }

    return undefined;
  }

  /** Collects the distinct, non-empty tag and category labels on the page. */
  private extractTags($: cheerio.CheerioAPI): string[] {
    const selectors = [
      '.tags a',
      '.tag-list a',
      '[class*="tag"] a',
      '.categories a',
      '[rel="tag"]',
    ];
    const tags = new Set<string>();

    for (const selector of selectors) {
      for (const el of $(selector).toArray()) {
        const tag = $(el).text().trim();
        if (tag) {
          tags.add(tag);
        }
      }
    }

    return Array.from(tags);
  }

  /**
   * Produces a summary for an article that has none.
   *
   * Asks the summary service first. If that call fails or returns no usable
   * summary, falls back to a summary built from the leading sentences of `body`.
   */
  private async generateSummaryFromBody(body: string, title?: string): Promise<string> {
    try {
      // Try AI-powered summary generation first
      const response = await axios.post(
        WebScraper.SUMMARY_ENDPOINT,
        { content: body, title: title ?? '' },
        {
          timeout: 10000, // 10 second timeout
          headers: { 'Content-Type': 'application/json' },
        },
      );

      const summary: unknown = response.data?.summary;
      if (typeof summary === 'string' && summary.trim()) {
        console.log('✅ AI-generated summary created');
        return summary;
      }
    } catch (error: unknown) {
      console.warn('⚠️ AI summary generation failed, using fallback method:', error instanceof Error ? error.message : 'Unknown error');
    }

    return this.buildFallbackSummary(body);
  }

  /**
   * Builds a summary of at most `SUMMARY_MAX_LENGTH` characters from the
   * leading sentences of `body`.
   *
   * Sentences shorter than 10 characters are skipped. When even the first
   * sentence does not fit, it is cut at a word boundary and suffixed with
   * `...` rather than being discarded.
   */
  private buildFallbackSummary(body: string): string {
    const limit = WebScraper.SUMMARY_MAX_LENGTH;
    const cleanedBody = body.replace(/^This article was originally published at .+?\.\n\n/i, '').trim();
    const sentences = cleanedBody
      .split(/[.!?]+/)
      .map((sentence) => sentence.trim())
      .filter((sentence) => sentence.length >= 10);

    const [firstSentence] = sentences;
    if (firstSentence === undefined) {
      return 'Content not available.';
    }

    let summary = '';
    for (const sentence of sentences) {
      const extended = summary ? `${summary}. ${sentence}` : sentence;
      // Reserve one character for the closing period.
      if (extended.length + 1 > limit) {
        break;
      }
      summary = extended;
    }

    if (summary) {
      return `${summary}.`;
    }

    const head = firstSentence.slice(0, limit - 3);
    const lastSpace = head.lastIndexOf(' ');
    const clipped = lastSpace > 0 ? head.slice(0, lastSpace) : head;
    return `${clipped.trimEnd()}...`;
  }

  /**
   * Downloads `imageUrl` into the scraped-assets directory as `filename` and
   * reads its dimensions with `sharp`.
   *
   * `filename` is joined onto the assets directory as given; callers should
   * pass a plain file name.
   *
   * @returns The saved image description, or `null` when the download, the
   * write or the metadata read failed.
   */
  async downloadImage(imageUrl: string, filename: string): Promise<ScrapedImage | null> {
    try {
      console.log(`Downloading image: ${imageUrl}`);

      const response = await axios.get(imageUrl, {
        responseType: 'stream',
        headers: {
          'User-Agent': WebScraper.USER_AGENT,
        },
        timeout: WebScraper.TIMEOUT,
      });

      const imagePath = join(WebScraper.assetsDir(), filename);
      const writer = createWriteStream(imagePath);

      await new Promise<void>((resolve, reject) => {
        // pipe() does not forward source errors to the destination, so without
        // this listener a connection dropped mid-transfer would never settle.
        response.data.on('error', (streamError: Error) => {
          writer.destroy();
          reject(streamError);
        });
        writer.on('error', reject);
        writer.on('finish', () => resolve());
        response.data.pipe(writer);
      });

      // Get image metadata
      const metadata = await sharp(imagePath).metadata();

      console.log(`Successfully downloaded: ${filename} (${metadata.width}x${metadata.height})`);

      return {
        url: imageUrl,
        filename,
        width: metadata.width,
        height: metadata.height,
      };
    } catch (error: unknown) {
      console.error(`Error downloading image ${imageUrl}:`, WebScraper.describeError(error));
      return null;
    }
  }

  /**
   * Writes a square, centre-cropped JPEG thumbnail of `imagePath` to `thumbnailPath`.
   *
   * @param size Edge length of the thumbnail in pixels.
   * @returns `true` on success, `false` when `sharp` reported an error.
   */
  async createThumbnail(imagePath: string, thumbnailPath: string, size = 300): Promise<boolean> {
    try {
      await sharp(imagePath)
        .resize(size, size, {
          fit: 'cover',
          position: 'center',
        })
        .jpeg({ quality: 80 })
        .toFile(thumbnailPath);

      return true;
    } catch (error: unknown) {
      console.error(`Error creating thumbnail:`, WebScraper.describeError(error));
      return false;
    }
  }

  /**
   * Scrapes `urls` in batches of at most `maxConcurrency` parallel requests,
   * pausing one second between batches to be respectful to servers.
   *
   * @param maxConcurrency Batch size; values below 1 (or NaN) are treated as 1.
   * @returns The scraped articles, plus one failure entry per URL that yielded none.
   */
  async scrapeMultipleArticles(urls: string[], maxConcurrency = 5): Promise<{
    successes: ScrapedArticle[];
    failures: Array<{url: string; error: string}>;
  }> {
    const successes: ScrapedArticle[] = [];
    const failures: Array<{url: string; error: string}> = [];
    const batches = this.chunkArray(urls, maxConcurrency);

    for (const [index, batch] of batches.entries()) {
      // Pause only between batches, not after the last one.
      if (index > 0) {
        await WebScraper.sleep(WebScraper.BATCH_DELAY_MS);
      }

      const outcomes = await Promise.all(batch.map((url) => this.settleScrape(url)));

      for (const { url, result, error } of outcomes) {
        if (result) {
          successes.push(result);
        } else {
          failures.push({ url, error: error || 'Failed to scrape' });
        }
      }
    }

    return { successes, failures };
  }

  /** Scrapes one URL and captures any thrown error as a message instead of rejecting. */
  private async settleScrape(url: string): Promise<{
    url: string;
    result: ScrapedArticle | null;
    error: string | null;
  }> {
    try {
      return { url, result: await this.scrapeArticle(url), error: null };
    } catch (error: unknown) {
      return { url, result: null, error: WebScraper.describeError(error) || 'Unknown error' };
    }
  }

  /**
   * Splits `array` into consecutive chunks of at most `chunkSize` elements.
   *
   * A `chunkSize` below 1 (or NaN) is treated as 1 so the loop always
   * advances; fractional sizes are rounded down.
   */
  private chunkArray<T>(array: T[], chunkSize: number): T[][] {
    const size = chunkSize >= 1 ? Math.floor(chunkSize) : 1;
    const chunks: T[][] = [];

    for (let start = 0; start < array.length; start += size) {
      chunks.push(array.slice(start, start + size));
    }

    return chunks;
  }

  /** Absolute path of the directory downloaded images are stored in. */
  private static assetsDir(): string {
    return join(process.cwd(), 'attached_assets', 'scraped');
  }

  /** Resolves after `ms` milliseconds. */
  private static sleep(ms: number): Promise<void> {
    return new Promise<void>((resolve) => setTimeout(resolve, ms));
  }

  /** Returns a printable message for a caught value of unknown type. */
  private static describeError(error: unknown): string {
    return error instanceof Error ? error.message : String(error);
  }

  /** Reads `error.response.status` when the caught value carries an HTTP response. */
  private static httpStatusOf(error: unknown): number | undefined {
    if (typeof error !== 'object' || error === null || !('response' in error)) {
      return undefined;
    }

    const response = (error as { response?: unknown }).response;
    if (typeof response !== 'object' || response === null || !('status' in response)) {
      return undefined;
    }

    const status = (response as { status?: unknown }).status;
    return typeof status === 'number' ? status : undefined;
  }

  /**
   * Decides whether a failed attempt is worth repeating: errors without an
   * HTTP status (network failures, timeouts), server errors (5xx), 408 and 429
   * are retried; every other status is treated as permanent.
   */
  private static isRetryable(error: unknown): boolean {
    const status = WebScraper.httpStatusOf(error);
    if (status === undefined) {
      return true;
    }
    return status >= 500 || status === 408 || status === 429;
  }
}