feat: SAPIENS Mobile App - Initial commit

React Native mobile application for the SAPIENS news platform. Consolidated all previous history into a single commit. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-10-23 14:30:25 +09:00
commit 919afe56f2
1516 changed files with 64072 additions and 0 deletions
--- a/server/scraper.ts
+++ b/server/scraper.ts
@ -0,0 +1,445 @@
+import axios from 'axios';
+import * as cheerio from 'cheerio';
+import { createWriteStream, existsSync, mkdirSync } from 'fs';
+import { join } from 'path';
+import sharp from 'sharp';
+
+interface ScrapedArticle { // Article fields produced by WebScraper.extractArticleData
+  url: string; // URL the article was scraped from (passed through unchanged)
+  title: string; // Headline text; extraction fails if this is empty
+  summary: string; // Summary found on the page, otherwise one generated from the body
+  body: string; // Article text; matched elements are joined with blank lines
+  imageUrl?: string; // Absolute http(s) URL of the main image, when one is found
+  publishedAt: Date; // Parsed publication date; falls back to the time of scraping
+  author?: string; // Author / byline text, when one is found
+  tags: string[]; // De-duplicated tag labels collected from tag/category links
+}
+
+interface ScrapedImage { // Result of WebScraper.downloadImage
+  url: string; // URL the image was downloaded from
+  filename: string; // File name used under attached_assets/scraped
+  alt?: string; // Alternative text; not populated by downloadImage in this file
+  width?: number; // Width as reported by sharp's metadata()
+  height?: number; // Height as reported by sharp's metadata()
+}
+
+/**
+ * Heuristic news-article scraper built on axios (HTTP), cheerio (HTML parsing)
+ * and sharp (image metadata / thumbnails).
+ *
+ * Every article field is resolved through an ordered list of CSS selectors;
+ * the first selector that yields a usable value wins. Downloaded images are
+ * written to `<cwd>/attached_assets/scraped`.
+ */
+export class WebScraper {
+  private static readonly USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36';
+  private static readonly TIMEOUT = 30000; // 30 seconds
+  private static readonly MAX_RETRIES = 3; // retries performed after the first attempt
+  private static readonly RETRY_DELAY_MS = 2000;
+  private static readonly BATCH_DELAY_MS = 1000;
+  private static readonly SUMMARY_MAX_LENGTH = 150;
+  private static readonly SUMMARY_ENDPOINT = 'http://localhost:5000/api/generate-summary';
+
+  /** Directory that downloaded images are written to. */
+  private readonly assetsDir = join(process.cwd(), 'attached_assets', 'scraped');
+
+  constructor() {
+    // Create assets directory if it doesn't exist
+    if (!existsSync(this.assetsDir)) {
+      mkdirSync(this.assetsDir, { recursive: true });
+    }
+  }
+
+  /**
+   * Fetches `url` and extracts an article from the returned HTML.
+   *
+   * Transient failures (no HTTP response, 5xx, 408, 429, or an exception while
+   * parsing) are retried up to `MAX_RETRIES` times with a fixed delay. Other
+   * HTTP error statuses are not retried because repeating the request cannot
+   * change the outcome.
+   *
+   * @param url - Page to scrape.
+   * @param retryCount - Number of retries already performed; callers normally omit it.
+   * @returns The extracted article, or `null` when fetching or extraction fails.
+   */
+  async scrapeArticle(url: string, retryCount = 0): Promise<ScrapedArticle | null> {
+    try {
+      console.log(`Scraping ${url} (attempt ${retryCount + 1})`);
+
+      const response = await axios.get(url, {
+        headers: {
+          'User-Agent': WebScraper.USER_AGENT,
+          'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
+          'Accept-Language': 'en-US,en;q=0.5',
+          'Accept-Encoding': 'gzip, deflate, br',
+          'Connection': 'keep-alive',
+          'Upgrade-Insecure-Requests': '1',
+        },
+        timeout: WebScraper.TIMEOUT,
+        maxRedirects: 5,
+      });
+
+      const $ = cheerio.load(response.data);
+      const article = await this.extractArticleData($, url);
+
+      if (!article) {
+        console.warn(`Failed to extract article data from ${url}`);
+        return null;
+      }
+
+      console.log(`Successfully scraped: ${article.title}`);
+      return article;
+    } catch (error: unknown) {
+      console.error(`Error scraping ${url}:`, WebScraper.errorMessage(error));
+
+      if (retryCount < WebScraper.MAX_RETRIES && WebScraper.isRetryable(error)) {
+        console.log(`Retrying ${url} in ${WebScraper.RETRY_DELAY_MS / 1000} seconds...`);
+        await WebScraper.delay(WebScraper.RETRY_DELAY_MS);
+        return this.scrapeArticle(url, retryCount + 1);
+      }
+
+      return null;
+    }
+  }
+
+  /**
+   * Runs every field extractor against the parsed page.
+   *
+   * @returns The assembled article, or `null` when no title or body was found.
+   */
+  private async extractArticleData($: cheerio.CheerioAPI, url: string): Promise<ScrapedArticle | null> {
+    const title = this.extractTitle($);
+    const summary = this.extractSummary($);
+    const body = this.extractBody($);
+    const imageUrl = this.extractMainImage($, url);
+    const publishedAt = this.extractPublishedDate($);
+    const author = this.extractAuthor($);
+    const tags = this.extractTags($);
+
+    if (!title || !body) {
+      console.warn(`Missing essential data for ${url}. Title: ${!!title}, Body: ${!!body}`);
+      return null;
+    }
+
+    return {
+      url,
+      title,
+      summary: summary || (await this.generateSummaryFromBody(body, title)),
+      body,
+      imageUrl,
+      publishedAt: publishedAt || new Date(),
+      author,
+      tags,
+    };
+  }
+
+  /** Returns the trimmed text of the first element matching `selector` that has any text, or `''`. */
+  private firstText($: cheerio.CheerioAPI, selector: string): string {
+    for (const el of $(selector).toArray()) {
+      const text = $(el).text().trim();
+      if (text) {
+        return text;
+      }
+    }
+    return '';
+  }
+
+  /** Returns the trimmed `attribute` value of the first element matching `selector` that has one, or `''`. */
+  private firstAttr($: cheerio.CheerioAPI, selector: string, attribute: string): string {
+    for (const el of $(selector).toArray()) {
+      const value = $(el).attr(attribute)?.trim();
+      if (value) {
+        return value;
+      }
+    }
+    return '';
+  }
+
+  /** Returns the article headline, or `''` when none of the known selectors yields text. */
+  private extractTitle($: cheerio.CheerioAPI): string {
+    // Try multiple common title selectors
+    const textSelectors = [
+      'h1.article-title',
+      'h1.entry-title',
+      'h1[class*="headline"]',
+      'h1[class*="title"]',
+      '.article-header h1',
+      '.post-title',
+      'h1',
+      'title',
+    ];
+
+    for (const selector of textSelectors) {
+      const text = this.firstText($, selector);
+      if (text) {
+        return text;
+      }
+    }
+
+    // <meta property="og:title"> carries its value in `content`; it has no text.
+    return this.firstAttr($, '[property="og:title"]', 'content');
+  }
+
+  /** Returns the page-provided summary or description, or `''` when there is none. */
+  private extractSummary($: cheerio.CheerioAPI): string {
+    const textSelectors = [
+      '.article-summary',
+      '.entry-summary',
+      '.article-excerpt',
+      '.post-excerpt',
+      '[class*="summary"]',
+      '[class*="excerpt"]',
+    ];
+    for (const selector of textSelectors) {
+      const text = this.firstText($, selector);
+      if (text) {
+        return text;
+      }
+    }
+
+    // Meta tags expose their value through the `content` attribute.
+    const metaSelectors = ['[property="og:description"]', '[name="description"]'];
+    for (const selector of metaSelectors) {
+      const content = this.firstAttr($, selector, 'content');
+      if (content) {
+        return content;
+      }
+    }
+
+    return '';
+  }
+
+  /**
+   * Returns the article text from the first selector that yields more than 200
+   * characters, or `''` when none does.
+   */
+  private extractBody($: cheerio.CheerioAPI): string {
+    const selectors = [
+      '.article-content',
+      '.entry-content',
+      '.post-content',
+      '.article-body',
+      '[class*="content"]',
+      '.story-body',
+      '.article p',
+      '.post p',
+    ];
+
+    for (const selector of selectors) {
+      const elements = $(selector);
+      if (!elements.length) {
+        continue;
+      }
+
+      let bodyText = '';
+      elements.each((_, el) => {
+        const node = $(el);
+        // A broad selector can match a container and its descendants; the
+        // container's text already includes theirs, so skip nested matches
+        // to avoid emitting the same paragraphs twice.
+        if (node.parents(selector).length > 0) {
+          return;
+        }
+        const text = node.text().trim();
+        if (text.length > 50) { // Skip short elements (ads, captions, etc.)
+          bodyText += text + '\n\n';
+        }
+      });
+
+      if (bodyText.length > 200) { // Ensure we have substantial content
+        return bodyText.trim();
+      }
+    }
+
+    return '';
+  }
+
+  /**
+   * Returns the absolute http(s) URL of the main article image, if any.
+   *
+   * @param baseUrl - URL of the page, used to resolve relative image paths.
+   */
+  private extractMainImage($: cheerio.CheerioAPI, baseUrl: string): string | undefined {
+    const sources: ReadonlyArray<{ selector: string; attribute: string }> = [
+      { selector: '.article-image img', attribute: 'src' },
+      { selector: '.featured-image img', attribute: 'src' },
+      { selector: '[class*="hero"] img', attribute: 'src' },
+      { selector: '.post-thumbnail img', attribute: 'src' },
+      { selector: '[property="og:image"]', attribute: 'content' },
+      { selector: 'meta[property="og:image"]', attribute: 'content' },
+      { selector: '.article img:first', attribute: 'src' },
+      { selector: 'img[class*="featured"]', attribute: 'src' },
+    ];
+
+    for (const { selector, attribute } of sources) {
+      for (const el of $(selector).toArray()) {
+        const raw = $(el).attr(attribute);
+        const resolved = raw ? WebScraper.resolveHttpUrl(raw, baseUrl) : undefined;
+        if (resolved) {
+          return resolved;
+        }
+      }
+    }
+
+    return undefined;
+  }
+
+  /**
+   * Resolves `src` against `baseUrl` and returns it only when the result is an
+   * http(s) URL. Returns `undefined` for unparsable input or other schemes
+   * (for example `data:` URIs).
+   */
+  private static resolveHttpUrl(src: string, baseUrl: string): string | undefined {
+    const candidate = src.trim();
+    if (!candidate) {
+      return undefined;
+    }
+
+    try {
+      // Protocol-relative URLs are upgraded to https; everything else
+      // (absolute, root-relative, path-relative) is resolved against the page.
+      const resolved = candidate.startsWith('//')
+        ? new URL(`https:${candidate}`)
+        : new URL(candidate, baseUrl);
+      return resolved.protocol === 'http:' || resolved.protocol === 'https:'
+        ? resolved.href
+        : undefined;
+    } catch {
+      return undefined;
+    }
+  }
+
+  /** Returns the first parsable publication date found on the page, or `null`. */
+  private extractPublishedDate($: cheerio.CheerioAPI): Date | null {
+    const selectors = [
+      '[property="article:published_time"]',
+      '[name="publish-date"]',
+      '.publish-date',
+      '.article-date',
+      '.entry-date',
+      'time[datetime]',
+      '[class*="date"]',
+    ];
+
+    for (const selector of selectors) {
+      for (const el of $(selector).toArray()) {
+        const node = $(el);
+        // Prefer machine-readable attributes over the visible text.
+        const raw = (node.attr('content') || node.attr('datetime') || node.text()).trim();
+        if (!raw) {
+          continue;
+        }
+        const parsed = new Date(raw);
+        if (!Number.isNaN(parsed.getTime())) {
+          return parsed;
+        }
+      }
+    }
+
+    return null;
+  }
+
+  /** Returns the author / byline text, or `undefined` when none is found. */
+  private extractAuthor($: cheerio.CheerioAPI): string | undefined {
+    const textSelectors = [
+      '[rel="author"]',
+      '.author-name',
+      '.byline',
+      '[class*="author"]',
+    ];
+
+    for (const selector of textSelectors) {
+      const text = this.firstText($, selector);
+      if (text) {
+        return text;
+      }
+    }
+
+    // Meta tag: the value lives in `content`.
+    return this.firstAttr($, '[property="article:author"]', 'content') || undefined;
+  }
+
+  /** Collects the de-duplicated text of all tag / category links, in document order per selector. */
+  private extractTags($: cheerio.CheerioAPI): string[] {
+    const tags = new Set<string>();
+
+    // Extract from various tag selectors
+    const selectors = [
+      '.tags a',
+      '.tag-list a',
+      '[class*="tag"] a',
+      '.categories a',
+      '[rel="tag"]',
+    ];
+
+    for (const selector of selectors) {
+      $(selector).each((_, el) => {
+        const tag = $(el).text().trim();
+        if (tag) {
+          tags.add(tag);
+        }
+      });
+    }
+
+    return Array.from(tags);
+  }
+
+  /**
+   * Produces a summary for an article that did not ship one.
+   *
+   * The summary endpoint is asked first; if that call fails or returns no
+   * usable text, a summary of at most `SUMMARY_MAX_LENGTH` characters is built
+   * from the leading sentences of `body`.
+   */
+  private async generateSummaryFromBody(body: string, title?: string): Promise<string> {
+    try {
+      // Try AI-powered summary generation first
+      const response = await axios.post(WebScraper.SUMMARY_ENDPOINT, {
+        content: body,
+        title: title || ''
+      }, {
+        timeout: 10000, // 10 second timeout
+        headers: {
+          'Content-Type': 'application/json'
+        }
+      });
+
+      const aiSummary: unknown = response.data?.summary;
+      if (typeof aiSummary === 'string' && aiSummary.trim()) {
+        console.log('✅ AI-generated summary created');
+        return aiSummary;
+      }
+    } catch (error) {
+      console.warn('⚠️  AI summary generation failed, using fallback method:', error instanceof Error ? error.message : 'Unknown error');
+    }
+
+    // Fallback to basic summary generation
+    const limit = WebScraper.SUMMARY_MAX_LENGTH;
+    const cleanedBody = body.replace(/^This article was originally published at .+?\.\n\n/i, '').trim();
+    const sentences = cleanedBody
+      .split(/[.!?]+/)
+      .map(sentence => sentence.trim())
+      .filter(sentence => sentence.length >= 10); // Skip very short sentences
+
+    let summary = '';
+    for (const sentence of sentences) {
+      const candidate = summary ? `${summary}. ${sentence}` : sentence;
+      // +1 reserves room for the closing period so the result never needs
+      // to be cut mid-word.
+      if (candidate.length + 1 > limit) {
+        break;
+      }
+      summary = candidate;
+    }
+
+    if (summary) {
+      return `${summary}.`;
+    }
+
+    // No whole sentence fits: truncate the first usable text instead of
+    // claiming that no content exists.
+    const fallbackText = sentences[0] ?? cleanedBody;
+    if (!fallbackText) {
+      return 'Content not available.';
+    }
+    return fallbackText.length <= limit
+      ? fallbackText
+      : `${fallbackText.substring(0, limit - 1).trimEnd()}…`;
+  }
+
+  /**
+   * Downloads `imageUrl` into the assets directory as `filename` and reads its
+   * dimensions with sharp.
+   *
+   * @param imageUrl - Absolute URL of the image.
+   * @param filename - Target file name; must resolve inside the assets directory.
+   * @returns Metadata of the stored image, or `null` on any failure.
+   */
+  async downloadImage(imageUrl: string, filename: string): Promise<ScrapedImage | null> {
+    try {
+      console.log(`Downloading image: ${imageUrl}`);
+
+      const imagePath = join(this.assetsDir, filename);
+      // join() normalises `..` segments, so a path that no longer starts with
+      // "<assetsDir><separator>" would be written outside the assets directory.
+      if (!imagePath.startsWith(join(this.assetsDir, '/'))) {
+        throw new Error(`Invalid image filename: ${filename}`);
+      }
+
+      const response = await axios.get(imageUrl, {
+        responseType: 'stream',
+        headers: {
+          'User-Agent': WebScraper.USER_AGENT,
+        },
+        timeout: WebScraper.TIMEOUT,
+      });
+
+      await new Promise<void>((resolve, reject) => {
+        const writer = createWriteStream(imagePath);
+        writer.on('finish', () => resolve());
+        writer.on('error', (err) => {
+          response.data.destroy();
+          reject(err);
+        });
+        // pipe() does not forward source errors to the destination; without
+        // this handler a broken download would leave the promise pending forever.
+        response.data.on('error', (err: Error) => {
+          writer.destroy();
+          reject(err);
+        });
+        response.data.pipe(writer);
+      });
+
+      // Get image metadata
+      const metadata = await sharp(imagePath).metadata();
+
+      console.log(`Successfully downloaded: ${filename} (${metadata.width}x${metadata.height})`);
+
+      return {
+        url: imageUrl,
+        filename,
+        width: metadata.width,
+        height: metadata.height,
+      };
+    } catch (error: unknown) {
+      console.error(`Error downloading image ${imageUrl}:`, WebScraper.errorMessage(error));
+      return null;
+    }
+  }
+
+  /**
+   * Writes a square, center-cropped JPEG thumbnail of `imagePath` to `thumbnailPath`.
+   *
+   * @param size - Edge length passed to sharp's `resize` for both width and height.
+   * @returns `true` on success, `false` if sharp throws.
+   */
+  async createThumbnail(imagePath: string, thumbnailPath: string, size = 300): Promise<boolean> {
+    try {
+      await sharp(imagePath)
+        .resize(size, size, {
+          fit: 'cover',
+          position: 'center',
+        })
+        .jpeg({ quality: 80 })
+        .toFile(thumbnailPath);
+
+      return true;
+    } catch (error: unknown) {
+      console.error(`Error creating thumbnail:`, WebScraper.errorMessage(error));
+      return false;
+    }
+  }
+
+  /**
+   * Scrapes `urls` in batches of at most `maxConcurrency` parallel requests,
+   * pausing briefly between batches to be respectful to servers.
+   *
+   * @param maxConcurrency - Batch size; values below 1 are treated as 1.
+   * @returns Scraped articles plus one failure entry per URL that yielded nothing.
+   */
+  async scrapeMultipleArticles(urls: string[], maxConcurrency = 5): Promise<{
+    successes: ScrapedArticle[];
+    failures: Array<{url: string; error: string}>;
+  }> {
+    type Outcome = { url: string; result: ScrapedArticle | null; error: string | null };
+
+    const successes: ScrapedArticle[] = [];
+    const failures: Array<{url: string; error: string}> = [];
+    const chunks = this.chunkArray(urls, maxConcurrency);
+
+    for (let index = 0; index < chunks.length; index++) {
+      const outcomes = await Promise.all(
+        chunks[index].map(async (url): Promise<Outcome> => {
+          try {
+            return { url, result: await this.scrapeArticle(url), error: null };
+          } catch (error: unknown) {
+            return { url, result: null, error: WebScraper.errorMessage(error) || 'Unknown error' };
+          }
+        })
+      );
+
+      // Separate successes and failures
+      for (const { url, result, error } of outcomes) {
+        if (result) {
+          successes.push(result);
+        } else {
+          failures.push({ url, error: error || 'Failed to scrape' });
+        }
+      }
+
+      // Only pause when another batch follows; waiting after the last one
+      // just delays the caller.
+      if (index < chunks.length - 1) {
+        await WebScraper.delay(WebScraper.BATCH_DELAY_MS);
+      }
+    }
+
+    return { successes, failures };
+  }
+
+  /**
+   * Splits `array` into consecutive slices of at most `chunkSize` items.
+   * A `chunkSize` below 1 (or NaN) is treated as 1 so the loop always advances.
+   */
+  private chunkArray<T>(array: T[], chunkSize: number): T[][] {
+    const size = chunkSize >= 1 ? Math.floor(chunkSize) : 1;
+    const chunks: T[][] = [];
+    for (let i = 0; i < array.length; i += size) {
+      chunks.push(array.slice(i, i + size));
+    }
+    return chunks;
+  }
+
+  /** Extracts a printable message from a caught value of unknown type. */
+  private static errorMessage(error: unknown): string {
+    return error instanceof Error ? error.message : String(error);
+  }
+
+  /**
+   * Decides whether a failed request is worth repeating. Errors without an
+   * HTTP response status (network failures, timeouts, parsing exceptions) and
+   * 5xx / 408 / 429 responses are retried; other statuses are not.
+   */
+  private static isRetryable(error: unknown): boolean {
+    if (typeof error !== 'object' || error === null) {
+      return true;
+    }
+    const status = (error as { response?: { status?: unknown } }).response?.status;
+    if (typeof status !== 'number') {
+      return true;
+    }
+    return status >= 500 || status === 408 || status === 429;
+  }
+
+  /** Resolves after `ms` milliseconds. */
+  private static delay(ms: number): Promise<void> {
+    return new Promise(resolve => setTimeout(resolve, ms));
+  }
+}