Files
sapiens-mobile/server/scraper.ts
jungwoo choi 919afe56f2 feat: SAPIENS Mobile App - Initial commit
React Native mobile application for SAPIENS news platform.
Consolidated all previous history into single commit.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-10-23 14:30:25 +09:00

445 lines
12 KiB
TypeScript

import axios from 'axios';
import * as cheerio from 'cheerio';
import { createWriteStream, existsSync, mkdirSync } from 'fs';
import { join } from 'path';
import sharp from 'sharp';
/**
 * Normalized article data extracted from a scraped news page.
 * Produced by WebScraper.scrapeArticle / extractArticleData.
 */
interface ScrapedArticle {
  /** The page URL the article was scraped from. */
  url: string;
  /** Article headline (required — extraction fails without it). */
  title: string;
  /** Short summary; AI-generated or derived from the body when the page has none. */
  summary: string;
  /** Full article text (required — extraction fails without it). */
  body: string;
  /** Absolute URL of the main/hero image, when one was found. */
  imageUrl?: string;
  /** Publication date from page metadata; falls back to scrape time. */
  publishedAt: Date;
  /** Byline author, when one was found. */
  author?: string;
  /** Deduplicated tags/categories; empty array when none were found. */
  tags: string[];
}
/**
 * Metadata for an image downloaded to the local scraped-assets directory.
 * Produced by WebScraper.downloadImage.
 */
interface ScrapedImage {
  /** Original remote URL the image was fetched from. */
  url: string;
  /** Filename under attached_assets/scraped/ where the image was saved. */
  filename: string;
  /** Alt text, if known. NOTE(review): never populated by downloadImage — confirm intended. */
  alt?: string;
  /** Pixel width reported by sharp metadata, when available. */
  width?: number;
  /** Pixel height reported by sharp metadata, when available. */
  height?: number;
}
/**
 * Scrapes news articles and images from arbitrary websites.
 *
 * Extraction uses ordered lists of fallback CSS selectors so the same code
 * works across different news-site markup (most-specific selector first).
 * Downloaded images are stored under `attached_assets/scraped/` in the
 * current working directory.
 */
export class WebScraper {
  // Browser-like UA string; many news sites serve degraded pages to obvious bots.
  private static readonly USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36';
  private static readonly TIMEOUT = 30000; // 30 seconds
  private static readonly MAX_RETRIES = 3;

  constructor() {
    // Create the assets directory up front so image downloads never race on it.
    const assetsDir = join(process.cwd(), 'attached_assets', 'scraped');
    if (!existsSync(assetsDir)) {
      mkdirSync(assetsDir, { recursive: true });
    }
  }

  /**
   * Fetch and parse a single article page.
   *
   * Retries up to MAX_RETRIES times with a 2-second pause between attempts.
   * Resolves to `null` — never rejects — when the page cannot be fetched or
   * the essential fields (title and body) cannot be extracted.
   *
   * @param url        Page to scrape.
   * @param retryCount Internal recursion counter; callers should omit it.
   */
  async scrapeArticle(url: string, retryCount = 0): Promise<ScrapedArticle | null> {
    try {
      console.log(`Scraping ${url} (attempt ${retryCount + 1})`);
      const response = await axios.get(url, {
        headers: {
          'User-Agent': WebScraper.USER_AGENT,
          'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
          'Accept-Language': 'en-US,en;q=0.5',
          'Accept-Encoding': 'gzip, deflate, br',
          'Connection': 'keep-alive',
          'Upgrade-Insecure-Requests': '1',
        },
        timeout: WebScraper.TIMEOUT,
        maxRedirects: 5,
      });
      const $ = cheerio.load(response.data);
      // Extract article data using multiple selectors for different news sites.
      const article = await this.extractArticleData($, url);
      if (!article) {
        console.warn(`Failed to extract article data from ${url}`);
        return null;
      }
      console.log(`Successfully scraped: ${article.title}`);
      return article;
    } catch (error: any) {
      console.error(`Error scraping ${url}:`, error.message);
      if (retryCount < WebScraper.MAX_RETRIES) {
        console.log(`Retrying ${url} in 2 seconds...`);
        await new Promise(resolve => setTimeout(resolve, 2000));
        return this.scrapeArticle(url, retryCount + 1);
      }
      return null;
    }
  }

  /**
   * Pull all article fields out of a loaded document.
   * Returns null when either the title or the body could not be found;
   * all other fields are optional or have fallbacks.
   */
  private async extractArticleData($: cheerio.CheerioAPI, url: string): Promise<ScrapedArticle | null> {
    const title = this.extractTitle($);
    const summary = this.extractSummary($);
    const body = this.extractBody($);
    const imageUrl = this.extractMainImage($, url);
    const publishedAt = this.extractPublishedDate($);
    const author = this.extractAuthor($);
    const tags = this.extractTags($);
    if (!title || !body) {
      console.warn(`Missing essential data for ${url}. Title: ${!!title}, Body: ${!!body}`);
      return null;
    }
    return {
      url,
      title,
      // Only generate a summary (network call) when the page didn't provide one.
      summary: summary || (await this.generateSummaryFromBody(body, title)),
      body,
      imageUrl,
      publishedAt: publishedAt || new Date(),
      author,
      tags,
    };
  }

  /** Extract the headline, trying site-specific selectors before generic ones. */
  private extractTitle($: cheerio.CheerioAPI): string {
    const selectors = [
      'h1.article-title',
      'h1.entry-title',
      'h1[class*="headline"]',
      'h1[class*="title"]',
      '.article-header h1',
      '.post-title',
      'h1',
      'title',
      '[property="og:title"]',
    ];
    for (const selector of selectors) {
      const element = $(selector).first();
      if (element.length) {
        // FIX: og:title is a <meta> tag — its value lives in the `content`
        // attribute, not in text(). The original read text() and so this
        // fallback could never match. Also check the FIRST match only, to
        // agree with what is returned.
        const text = selector.includes('property')
          ? element.attr('content')
          : element.text();
        if (text && text.trim()) {
          return text.trim();
        }
      }
    }
    return '';
  }

  /** Extract a pre-existing summary/description; empty string when absent. */
  private extractSummary($: cheerio.CheerioAPI): string {
    const selectors = [
      '.article-summary',
      '.entry-summary',
      '.article-excerpt',
      '.post-excerpt',
      '[class*="summary"]',
      '[class*="excerpt"]',
      '[property="og:description"]',
      '[name="description"]',
    ];
    for (const selector of selectors) {
      const element = $(selector);
      if (element.length) {
        // Meta tags carry their value in the `content` attribute.
        const text = selector.includes('property') || selector.includes('name')
          ? element.attr('content')
          : element.text();
        if (text && text.trim()) {
          return text.trim();
        }
      }
    }
    return '';
  }

  /**
   * Extract the main article text. Joins qualifying elements with blank
   * lines; an extraction only counts when it yields > 200 characters, so a
   * generic selector that matched boilerplate falls through to the next one.
   */
  private extractBody($: cheerio.CheerioAPI): string {
    const selectors = [
      '.article-content',
      '.entry-content',
      '.post-content',
      '.article-body',
      '[class*="content"]',
      '.story-body',
      '.article p',
      '.post p',
    ];
    for (const selector of selectors) {
      const elements = $(selector);
      if (elements.length) {
        // Get text from all paragraphs and join them.
        let bodyText = '';
        elements.each((_, el) => {
          const text = $(el).text().trim();
          if (text && text.length > 50) { // Skip short elements (ads, captions, etc.)
            bodyText += text + '\n\n';
          }
        });
        if (bodyText.length > 200) { // Ensure we have substantial content
          return bodyText.trim();
        }
      }
    }
    return '';
  }

  /**
   * Find the main/hero image and return it as an absolute http(s) URL.
   * Protocol-relative (`//…`) and root-relative (`/…`) sources are resolved
   * against the article's own URL; anything else non-http is skipped.
   */
  private extractMainImage($: cheerio.CheerioAPI, baseUrl: string): string | undefined {
    const selectors = [
      '.article-image img',
      '.featured-image img',
      '[class*="hero"] img',
      '.post-thumbnail img',
      '[property="og:image"]',
      'meta[property="og:image"]',
      '.article img:first',
      'img[class*="featured"]',
    ];
    for (const selector of selectors) {
      const element = $(selector);
      if (element.length) {
        // Meta tags use `content`; real <img> elements use `src`.
        let src = selector.includes('property') || selector.includes('meta')
          ? element.attr('content')
          : element.attr('src');
        if (src) {
          // Convert relative URLs to absolute.
          if (src.startsWith('//')) {
            src = 'https:' + src;
          } else if (src.startsWith('/')) {
            const urlObj = new URL(baseUrl);
            src = `${urlObj.origin}${src}`;
          }
          if (src.startsWith('http')) {
            return src;
          }
        }
      }
    }
    return undefined;
  }

  /**
   * Extract the publication date from metadata, <time datetime>, or visible
   * text. Returns null when no candidate parses to a valid Date.
   */
  private extractPublishedDate($: cheerio.CheerioAPI): Date | null {
    const selectors = [
      '[property="article:published_time"]',
      '[name="publish-date"]',
      '.publish-date',
      '.article-date',
      '.entry-date',
      'time[datetime]',
      '[class*="date"]',
    ];
    for (const selector of selectors) {
      const element = $(selector);
      if (element.length) {
        // Prefer machine-readable attributes over display text.
        const dateStr = element.attr('content') || element.attr('datetime') || element.text();
        if (dateStr) {
          const date = new Date(dateStr);
          if (!isNaN(date.getTime())) {
            return date;
          }
        }
      }
    }
    return null;
  }

  /** Extract the byline author; undefined when none is found. */
  private extractAuthor($: cheerio.CheerioAPI): string | undefined {
    const selectors = [
      '[rel="author"]',
      '.author-name',
      '.byline',
      '[class*="author"]',
      '[property="article:author"]',
    ];
    for (const selector of selectors) {
      const element = $(selector);
      if (element.length) {
        const author = selector.includes('property')
          ? element.attr('content')
          : element.text();
        if (author && author.trim()) {
          return author.trim();
        }
      }
    }
    return undefined;
  }

  /** Collect tags/categories from every matching selector, deduplicated. */
  private extractTags($: cheerio.CheerioAPI): string[] {
    const tags = new Set<string>();
    const selectors = [
      '.tags a',
      '.tag-list a',
      '[class*="tag"] a',
      '.categories a',
      '[rel="tag"]',
    ];
    for (const selector of selectors) {
      $(selector).each((_, el) => {
        const tag = $(el).text().trim();
        if (tag) {
          tags.add(tag);
        }
      });
    }
    return Array.from(tags);
  }

  /**
   * Produce a summary for an article that has none.
   *
   * First tries the local AI summary endpoint; on any failure falls back to
   * taking the first sentences of the body, capped at 150 characters.
   * NOTE(review): the endpoint URL is hardcoded to localhost:5000 — consider
   * making it configurable via an environment variable.
   */
  private async generateSummaryFromBody(body: string, title?: string): Promise<string> {
    try {
      // Try AI-powered summary generation first.
      const response = await axios.post('http://localhost:5000/api/generate-summary', {
        content: body,
        title: title || ''
      }, {
        timeout: 10000, // 10 second timeout
        headers: {
          'Content-Type': 'application/json'
        }
      });
      if (response.data?.summary) {
        console.log('✅ AI-generated summary created');
        return response.data.summary;
      }
    } catch (error) {
      console.warn('⚠️ AI summary generation failed, using fallback method:', error instanceof Error ? error.message : 'Unknown error');
    }
    // Fallback: strip a leading syndication notice, then take whole sentences
    // until the 150-character budget is reached.
    const cleanedBody = body.replace(/^This article was originally published at .+?\.\n\n/i, '').trim();
    const sentences = cleanedBody.split(/[.!?]+/);
    let summary = '';
    for (const sentence of sentences) {
      const trimmed = sentence.trim();
      if (trimmed.length < 10) continue; // Skip very short sentences
      if (summary.length + trimmed.length > 150) break;
      summary += (summary ? '. ' : '') + trimmed;
    }
    return (summary + (summary ? '.' : 'Content not available.')).substring(0, 150);
  }

  /**
   * Download an image to attached_assets/scraped/<filename> and read its
   * dimensions with sharp. Resolves to null on any failure.
   */
  async downloadImage(imageUrl: string, filename: string): Promise<ScrapedImage | null> {
    try {
      console.log(`Downloading image: ${imageUrl}`);
      const response = await axios.get(imageUrl, {
        responseType: 'stream',
        headers: {
          'User-Agent': WebScraper.USER_AGENT,
        },
        timeout: WebScraper.TIMEOUT,
      });
      const assetsDir = join(process.cwd(), 'attached_assets', 'scraped');
      const imagePath = join(assetsDir, filename);
      const writer = createWriteStream(imagePath);
      response.data.pipe(writer);
      await new Promise((resolve, reject) => {
        writer.on('finish', resolve);
        writer.on('error', reject);
      });
      // Get image metadata.
      const metadata = await sharp(imagePath).metadata();
      // FIX: original logged the literal text "$(unknown)" — the template
      // placeholder was mangled. Log the actual filename.
      console.log(`Successfully downloaded: ${filename} (${metadata.width}x${metadata.height})`);
      return {
        url: imageUrl,
        filename,
        width: metadata.width,
        height: metadata.height,
      };
    } catch (error: any) {
      console.error(`Error downloading image ${imageUrl}:`, error.message);
      return null;
    }
  }

  /**
   * Create a square, center-cropped JPEG thumbnail of a downloaded image.
   *
   * @param size Edge length in pixels (default 300).
   * @returns true on success, false on any sharp failure.
   */
  async createThumbnail(imagePath: string, thumbnailPath: string, size = 300): Promise<boolean> {
    try {
      await sharp(imagePath)
        .resize(size, size, {
          fit: 'cover',
          position: 'center',
        })
        .jpeg({ quality: 80 })
        .toFile(thumbnailPath);
      return true;
    } catch (error: any) {
      console.error(`Error creating thumbnail:`, error.message);
      return false;
    }
  }

  /**
   * Scrape many URLs with bounded concurrency, returning successes and
   * failures separately so the caller can report/retry the failed ones.
   *
   * Note: scrapeArticle resolves null instead of rejecting, so `error` here
   * is usually the generic "Failed to scrape" message rather than the
   * underlying cause (which was already logged).
   *
   * @param maxConcurrency Number of URLs scraped in parallel per batch.
   */
  async scrapeMultipleArticles(urls: string[], maxConcurrency = 5): Promise<{
    successes: ScrapedArticle[];
    failures: Array<{url: string; error: string}>;
  }> {
    const successes: ScrapedArticle[] = [];
    const failures: Array<{url: string; error: string}> = [];
    const chunks = this.chunkArray(urls, maxConcurrency);
    for (let i = 0; i < chunks.length; i++) {
      const promises = chunks[i].map(url =>
        this.scrapeArticle(url)
          .then(result => ({ url, result, error: null }))
          .catch(error => ({ url, result: null, error: error.message || 'Unknown error' }))
      );
      const chunkResults = await Promise.all(promises);
      // Separate successes and failures.
      for (const { url, result, error } of chunkResults) {
        if (result) {
          successes.push(result);
        } else {
          failures.push({ url, error: error || 'Failed to scrape' });
        }
      }
      // FIX: only pause BETWEEN batches — the original also slept for a
      // second after the final batch, delaying every call for no reason.
      if (i < chunks.length - 1) {
        await new Promise(resolve => setTimeout(resolve, 1000));
      }
    }
    return { successes, failures };
  }

  /** Split an array into consecutive chunks of at most chunkSize elements. */
  private chunkArray<T>(array: T[], chunkSize: number): T[][] {
    const chunks: T[][] = [];
    for (let i = 0; i < array.length; i += chunkSize) {
      chunks.push(array.slice(i, i + chunkSize));
    }
    return chunks;
  }
}