import axios from 'axios';
import * as cheerio from 'cheerio';
import { createWriteStream, existsSync, mkdirSync } from 'fs';
import { isAbsolute, join, relative, sep } from 'path';
import { pipeline } from 'stream/promises';
import sharp from 'sharp';

/** Structured content extracted from a single article page. */
export interface ScrapedArticle {
  url: string;
  title: string;
  summary: string;
  body: string;
  imageUrl?: string;
  publishedAt: Date;
  author?: string;
  tags: string[];
}

/** Metadata describing an image that was downloaded to the assets directory. */
export interface ScrapedImage {
  url: string;
  filename: string;
  alt?: string;
  width?: number;
  height?: number;
}

/**
 * Scrapes news-style article pages with a cascade of fallback CSS selectors,
 * and downloads/thumbnails their images into `attached_assets/scraped` under
 * the current working directory.
 */
export class WebScraper {
  private static readonly USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36';
  private static readonly TIMEOUT = 30000; // 30 seconds
  private static readonly MAX_RETRIES = 3;
  private static readonly RETRY_DELAY_MS = 2000;
  private static readonly BATCH_DELAY_MS = 1000;
  private static readonly SUMMARY_ENDPOINT = 'http://localhost:5000/api/generate-summary';
  private static readonly SUMMARY_TIMEOUT = 10000; // 10 seconds
  private static readonly SUMMARY_MAX_LENGTH = 150;

  constructor() {
    // Create assets directory if it doesn't exist
    const assetsDir = WebScraper.assetsDir();
    if (!existsSync(assetsDir)) {
      mkdirSync(assetsDir, { recursive: true });
    }
  }

  /** Directory that downloaded images are written to (resolved against the cwd at call time). */
  private static assetsDir(): string {
    return join(process.cwd(), 'attached_assets', 'scraped');
  }

  /** Best-effort human-readable message for a caught value of unknown type. */
  private static describeError(error: unknown): string {
    return error instanceof Error ? error.message : String(error);
  }

  /**
   * Decides whether a failed request is worth retrying. Errors that carry an
   * HTTP status are retried only for 408, 429 and 5xx; anything without a
   * status (network failure, timeout, parse error) is treated as transient.
   */
  private static isRetryable(error: unknown): boolean {
    const status = (error as { response?: { status?: unknown } } | null | undefined)?.response?.status;
    if (typeof status !== 'number') {
      return true;
    }
    return status === 408 || status === 429 || status >= 500;
  }

  /** True when the selector targets a `<meta>`-style element whose value lives in its `content` attribute. */
  private static readsContentAttribute(selector: string): boolean {
    return /\[(?:property|name)=/.test(selector);
  }

  /** Promise-based sleep. */
  private static delay(ms: number): Promise<void> {
    return new Promise<void>((resolve) => setTimeout(resolve, ms));
  }

  /**
   * Fetches `url` and extracts an article from the returned HTML.
   *
   * @param url - Page to scrape.
   * @param retryCount - Number of retries already performed (callers normally omit this).
   * @returns The extracted article, or `null` when the page could not be
   *   fetched or lacks a title/body. This method never rejects.
   */
  async scrapeArticle(url: string, retryCount = 0): Promise<ScrapedArticle | null> {
    try {
      console.log(`Scraping ${url} (attempt ${retryCount + 1})`);

      const response = await axios.get(url, {
        headers: {
          'User-Agent': WebScraper.USER_AGENT,
          'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
          'Accept-Language': 'en-US,en;q=0.5',
          // NOTE(review): advertising 'br' presumably relies on the installed axios
          // version decoding brotli transparently — confirm against the pinned version.
          'Accept-Encoding': 'gzip, deflate, br',
          'Connection': 'keep-alive',
          'Upgrade-Insecure-Requests': '1',
        },
        timeout: WebScraper.TIMEOUT,
        maxRedirects: 5,
      });

      const html: unknown = response.data;
      if (typeof html !== 'string') {
        // axios parses JSON bodies into objects; there is no HTML to load and retrying will not help.
        console.warn(`Unexpected non-HTML response from ${url}`);
        return null;
      }

      const $ = cheerio.load(html);

      // Extract article data using multiple selectors for different news sites
      const article = await this.extractArticleData($, url);
      if (!article) {
        console.warn(`Failed to extract article data from ${url}`);
        return null;
      }

      console.log(`Successfully scraped: ${article.title}`);
      return article;
    } catch (error: unknown) {
      console.error(`Error scraping ${url}:`, WebScraper.describeError(error));

      if (retryCount < WebScraper.MAX_RETRIES && WebScraper.isRetryable(error)) {
        console.log(`Retrying ${url} in ${WebScraper.RETRY_DELAY_MS / 1000} seconds...`);
        await WebScraper.delay(WebScraper.RETRY_DELAY_MS);
        return this.scrapeArticle(url, retryCount + 1);
      }

      return null;
    }
  }

  /**
   * Runs every field extractor against the loaded document.
   *
   * @returns The assembled article, or `null` when the title or body is missing.
   *   A missing summary is generated from the body; a missing date defaults to now.
   */
  private async extractArticleData($: cheerio.CheerioAPI, url: string): Promise<ScrapedArticle | null> {
    const title = this.extractTitle($);
    const summary = this.extractSummary($);
    const body = this.extractBody($);
    const imageUrl = this.extractMainImage($, url);
    const publishedAt = this.extractPublishedDate($);
    const author = this.extractAuthor($);
    const tags = this.extractTags($);

    if (!title || !body) {
      console.warn(`Missing essential data for ${url}. Title: ${!!title}, Body: ${!!body}`);
      return null;
    }

    return {
      url,
      title,
      summary: summary || (await this.generateSummaryFromBody(body, title)),
      body,
      imageUrl,
      publishedAt: publishedAt || new Date(),
      author,
      tags,
    };
  }

  /**
   * Returns the first non-empty title found by the selector cascade, or `''`.
   * Meta selectors are read from their `content` attribute, and an empty first
   * match no longer hides a later non-empty one.
   */
  private extractTitle($: cheerio.CheerioAPI): string {
    // Try multiple common title selectors
    const selectors = [
      'h1.article-title',
      'h1.entry-title',
      'h1[class*="headline"]',
      'h1[class*="title"]',
      '.article-header h1',
      '.post-title',
      'h1',
      'title',
      '[property="og:title"]',
    ];

    for (const selector of selectors) {
      const fromContent = WebScraper.readsContentAttribute(selector);
      for (const el of $(selector).toArray()) {
        const node = $(el);
        const text = ((fromContent ? node.attr('content') : node.text()) ?? '').trim();
        if (text) {
          return text;
        }
      }
    }

    return '';
  }

  /** Returns the first non-empty summary/description found by the selector cascade, or `''`. */
  private extractSummary($: cheerio.CheerioAPI): string {
    const selectors = [
      '.article-summary',
      '.entry-summary',
      '.article-excerpt',
      '.post-excerpt',
      '[class*="summary"]',
      '[class*="excerpt"]',
      '[property="og:description"]',
      '[name="description"]',
    ];

    for (const selector of selectors) {
      const element = $(selector);
      if (!element.length) {
        continue;
      }
      const text = WebScraper.readsContentAttribute(selector) ? element.attr('content') : element.text();
      const trimmed = text?.trim();
      if (trimmed) {
        return trimmed;
      }
    }

    return '';
  }

  /**
   * Returns the article body as paragraphs separated by blank lines, or `''`
   * when no selector yields more than 200 characters of text.
   */
  private extractBody($: cheerio.CheerioAPI): string {
    const selectors = [
      '.article-content',
      '.entry-content',
      '.post-content',
      '.article-body',
      '[class*="content"]',
      '.story-body',
      '.article p',
      '.post p',
    ];

    for (const selector of selectors) {
      const elements = $(selector);
      if (!elements.length) {
        continue;
      }

      let bodyText = '';
      elements.each((_, el) => {
        const node = $(el);
        // A match nested inside another match is already covered by its
        // ancestor's text; including it again would duplicate content.
        if (node.parents(selector).length > 0) {
          return;
        }
        const text = node.text().trim();
        if (text.length > 50) {
          // Skip short elements (ads, captions, etc.)
          bodyText += text + '\n\n';
        }
      });

      if (bodyText.length > 200) {
        // Ensure we have substantial content
        return bodyText.trim();
      }
    }

    return '';
  }

  /**
   * Resolves a possibly relative image reference against `baseUrl`.
   * Already-absolute http(s) URLs are returned unchanged and protocol-relative
   * URLs are given the `https:` scheme.
   *
   * @returns An absolute http(s) URL, or `undefined` if `src` is empty,
   *   unparsable, or uses another scheme (e.g. `data:`).
   */
  private static resolveHttpUrl(src: string | undefined, baseUrl: string): string | undefined {
    const trimmed = src?.trim();
    if (!trimmed) {
      return undefined;
    }

    const candidate = trimmed.startsWith('//') ? `https:${trimmed}` : trimmed;
    if (/^https?:\/\//i.test(candidate)) {
      return candidate;
    }

    try {
      const resolved = new URL(candidate, baseUrl);
      return resolved.protocol === 'http:' || resolved.protocol === 'https:' ? resolved.href : undefined;
    } catch {
      // Invalid src or baseUrl: treat as "no usable image" rather than failing the scrape.
      return undefined;
    }
  }

  /** Returns the absolute URL of the article's lead image, or `undefined` if none is found. */
  private extractMainImage($: cheerio.CheerioAPI, baseUrl: string): string | undefined {
    const selectors = [
      '.article-image img',
      '.featured-image img',
      '[class*="hero"] img',
      '.post-thumbnail img',
      '[property="og:image"]',
      'meta[property="og:image"]',
      '.article img:first',
      'img[class*="featured"]',
    ];

    for (const selector of selectors) {
      const element = $(selector);
      if (!element.length) {
        continue;
      }
      const raw = WebScraper.readsContentAttribute(selector) ? element.attr('content') : element.attr('src');
      const resolved = WebScraper.resolveHttpUrl(raw, baseUrl);
      if (resolved) {
        return resolved;
      }
    }

    return undefined;
  }

  /** Returns the first parseable publication date found by the selector cascade, or `null`. */
  private extractPublishedDate($: cheerio.CheerioAPI): Date | null {
    const selectors = [
      '[property="article:published_time"]',
      '[name="publish-date"]',
      '.publish-date',
      '.article-date',
      '.entry-date',
      'time[datetime]',
      '[class*="date"]',
    ];

    for (const selector of selectors) {
      const element = $(selector);
      if (!element.length) {
        continue;
      }
      // Prefer machine-readable attributes over visible text.
      const dateStr = element.attr('content') || element.attr('datetime') || element.text();
      if (dateStr) {
        const date = new Date(dateStr);
        if (!isNaN(date.getTime())) {
          return date;
        }
      }
    }

    return null;
  }

  /** Returns the first non-empty author string found by the selector cascade, or `undefined`. */
  private extractAuthor($: cheerio.CheerioAPI): string | undefined {
    const selectors = [
      '[rel="author"]',
      '.author-name',
      '.byline',
      '[class*="author"]',
      '[property="article:author"]',
    ];

    for (const selector of selectors) {
      const element = $(selector);
      if (!element.length) {
        continue;
      }
      const author = WebScraper.readsContentAttribute(selector) ? element.attr('content') : element.text();
      const trimmed = author?.trim();
      if (trimmed) {
        return trimmed;
      }
    }

    return undefined;
  }

  /** Collects the distinct, non-empty tag/category link texts in document order per selector. */
  private extractTags($: cheerio.CheerioAPI): string[] {
    const tags = new Set<string>();

    // Extract from various tag selectors
    const selectors = [
      '.tags a',
      '.tag-list a',
      '[class*="tag"] a',
      '.categories a',
      '[rel="tag"]',
    ];

    for (const selector of selectors) {
      $(selector).each((_, el) => {
        const tag = $(el).text().trim();
        if (tag) {
          tags.add(tag);
        }
      });
    }

    return Array.from(tags);
  }

  /**
   * Produces a summary for `body`, first by POSTing to the summary service and,
   * if that fails or returns no usable text, by a local sentence-based fallback.
   */
  private async generateSummaryFromBody(body: string, title?: string): Promise<string> {
    try {
      // Try AI-powered summary generation first
      const response = await axios.post(
        WebScraper.SUMMARY_ENDPOINT,
        { content: body, title: title || '' },
        {
          timeout: WebScraper.SUMMARY_TIMEOUT,
          headers: { 'Content-Type': 'application/json' },
        },
      );

      const aiSummary: unknown = response.data?.summary;
      if (typeof aiSummary === 'string' && aiSummary.trim()) {
        console.log('✅ AI-generated summary created');
        return aiSummary;
      }
    } catch (error: unknown) {
      console.warn(
        '⚠️ AI summary generation failed, using fallback method:',
        error instanceof Error ? error.message : 'Unknown error',
      );
    }

    return this.buildFallbackSummary(body);
  }

  /**
   * Builds a summary of at most `SUMMARY_MAX_LENGTH` characters from the leading
   * sentences of `body`. If the first usable sentence alone exceeds the limit it
   * is truncated with an ellipsis instead of being discarded.
   */
  private buildFallbackSummary(body: string): string {
    const limit = WebScraper.SUMMARY_MAX_LENGTH;
    const cleanedBody = body.replace(/^This article was originally published at .+?\.\n\n/i, '').trim();

    let summary = '';
    for (const sentence of cleanedBody.split(/[.!?]+/)) {
      const trimmed = sentence.trim();
      if (trimmed.length < 10) continue; // Skip very short sentences
      if (summary.length + trimmed.length > limit) {
        if (!summary) {
          return `${trimmed.substring(0, limit - 3).trimEnd()}...`;
        }
        break;
      }
      summary += (summary ? '. ' : '') + trimmed;
    }

    return (summary + (summary ? '.' : 'Content not available.')).substring(0, limit);
  }

  /**
   * Downloads `imageUrl` into the assets directory as `filename` and reads its dimensions.
   *
   * @param imageUrl - Absolute URL of the image.
   * @param filename - Target path relative to the assets directory; paths that
   *   would resolve outside that directory are refused.
   * @returns Image metadata, or `null` on any failure (the error is logged).
   */
  async downloadImage(imageUrl: string, filename: string): Promise<ScrapedImage | null> {
    try {
      console.log(`Downloading image: ${imageUrl}`);

      const assetsDir = WebScraper.assetsDir();
      const imagePath = join(assetsDir, filename);
      const relativePath = relative(assetsDir, imagePath);
      if (
        !relativePath ||
        relativePath === '..' ||
        relativePath.startsWith(`..${sep}`) ||
        isAbsolute(relativePath)
      ) {
        throw new Error(`Refusing to write outside the assets directory: ${filename}`);
      }

      const response = await axios.get(imageUrl, {
        responseType: 'stream',
        headers: {
          'User-Agent': WebScraper.USER_AGENT,
        },
        timeout: WebScraper.TIMEOUT,
      });

      // pipeline() rejects on errors from either side of the pipe and tears both
      // streams down, so a connection dropped mid-download cannot hang this call.
      await pipeline(response.data, createWriteStream(imagePath));

      // Get image metadata
      const metadata = await sharp(imagePath).metadata();

      console.log(`Successfully downloaded: ${filename} (${metadata.width}x${metadata.height})`);

      return {
        url: imageUrl,
        filename,
        width: metadata.width,
        height: metadata.height,
      };
    } catch (error: unknown) {
      console.error(`Error downloading image ${imageUrl}:`, WebScraper.describeError(error));
      return null;
    }
  }

  /**
   * Writes a square, center-cropped JPEG thumbnail of `imagePath` to `thumbnailPath`.
   *
   * @param size - Edge length of the thumbnail in pixels.
   * @returns `true` on success, `false` on failure (the error is logged).
   */
  async createThumbnail(imagePath: string, thumbnailPath: string, size = 300): Promise<boolean> {
    try {
      await sharp(imagePath)
        .resize(size, size, {
          fit: 'cover',
          position: 'center',
        })
        .jpeg({ quality: 80 })
        .toFile(thumbnailPath);

      return true;
    } catch (error: unknown) {
      console.error(`Error creating thumbnail:`, WebScraper.describeError(error));
      return false;
    }
  }

  /**
   * Scrapes `urls` in batches of at most `maxConcurrency` parallel requests,
   * pausing briefly between batches.
   *
   * @param maxConcurrency - Batch size; values below 1 (or non-numeric) are treated as 1.
   * @returns Every successfully scraped article plus one failure record per URL that yielded nothing.
   */
  async scrapeMultipleArticles(urls: string[], maxConcurrency = 5): Promise<{
    successes: ScrapedArticle[];
    failures: Array<{url: string; error: string}>;
  }> {
    const successes: ScrapedArticle[] = [];
    const failures: Array<{url: string; error: string}> = [];
    const chunks = this.chunkArray(urls, maxConcurrency);

    for (const [index, chunk] of chunks.entries()) {
      const chunkResults = await Promise.all(
        chunk.map(async (url): Promise<{ url: string; result: ScrapedArticle | null; error: string | null }> => {
          try {
            return { url, result: await this.scrapeArticle(url), error: null };
          } catch (error: unknown) {
            return { url, result: null, error: WebScraper.describeError(error) || 'Unknown error' };
          }
        }),
      );

      // Separate successes and failures
      for (const { url, result, error } of chunkResults) {
        if (result) {
          successes.push(result);
        } else {
          failures.push({ url, error: error || 'Failed to scrape' });
        }
      }

      // Small delay between batches to be respectful to servers; nothing follows the last batch.
      if (index < chunks.length - 1) {
        await WebScraper.delay(WebScraper.BATCH_DELAY_MS);
      }
    }

    return { successes, failures };
  }

  /**
   * Splits `array` into consecutive slices of at most `chunkSize` items.
   * A `chunkSize` below 1 or NaN is clamped to 1 so the loop always advances.
   */
  private chunkArray<T>(array: T[], chunkSize: number): T[][] {
    const size = Math.max(1, Math.floor(chunkSize) || 1);
    const chunks: T[][] = [];
    for (let i = 0; i < array.length; i += size) {
      chunks.push(array.slice(i, i + size));
    }
    return chunks;
  }
}