Files
sapiens-mobile/server/scraper.ts
jungwoo choi 919afe56f2 feat: SAPIENS Mobile App - Initial commit
React Native mobile application for SAPIENS news platform.
Consolidated all previous history into single commit.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-10-23 14:30:25 +09:00

445 lines
12 KiB
TypeScript

import axios from 'axios';
import * as cheerio from 'cheerio';
import { createWriteStream, existsSync, mkdirSync } from 'fs';
import { join } from 'path';
import sharp from 'sharp';
/**
 * Normalized article data extracted from a scraped news page.
 * Produced by WebScraper.scrapeArticle / extractArticleData.
 */
interface ScrapedArticle {
  /** The page URL the article was scraped from. */
  url: string;
  /** Article headline (required — extraction fails without it). */
  title: string;
  /** Short summary; AI-generated or derived from the body when the page has none. */
  summary: string;
  /** Full article text (required — extraction fails without it). */
  body: string;
  /** Absolute URL of the main/hero image, when one was found. */
  imageUrl?: string;
  /** Publication date from page metadata; falls back to scrape time. */
  publishedAt: Date;
  /** Byline author, when one was found. */
  author?: string;
  /** Deduplicated tags/categories; empty array when none were found. */
  tags: string[];
}
/**
 * Metadata for an image downloaded to the local scraped-assets directory.
 * Produced by WebScraper.downloadImage.
 */
interface ScrapedImage {
  /** Original remote URL the image was fetched from. */
  url: string;
  /** Filename under attached_assets/scraped/ where the image was saved. */
  filename: string;
  /** Alt text, if known. NOTE(review): never populated by downloadImage — confirm intended. */
  alt?: string;
  /** Pixel width reported by sharp metadata, when available. */
  width?: number;
  /** Pixel height reported by sharp metadata, when available. */
  height?: number;
}
/**
 * Scrapes news articles and images from arbitrary websites.
 *
 * Extraction uses ordered lists of fallback CSS selectors so the same code
 * works across different news-site markup (most-specific selector first).
 * Downloaded images are stored under `attached_assets/scraped/` in the
 * current working directory.
 */
export class WebScraper {
  // Browser-like UA string; many news sites serve degraded pages to obvious bots.
  private static readonly USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36';
  private static readonly TIMEOUT = 30000; // 30 seconds
  private static readonly MAX_RETRIES = 3;

  constructor() {
    // Create the assets directory up front so image downloads never race on it.
    const assetsDir = join(process.cwd(), 'attached_assets', 'scraped');
    if (!existsSync(assetsDir)) {
      mkdirSync(assetsDir, { recursive: true });
    }
  }

  /**
   * Fetch and parse a single article page.
   *
   * Retries up to MAX_RETRIES times with a 2-second pause between attempts.
   * Resolves to `null` — never rejects — when the page cannot be fetched or
   * the essential fields (title and body) cannot be extracted.
   *
   * @param url        Page to scrape.
   * @param retryCount Internal recursion counter; callers should omit it.
   */
  async scrapeArticle(url: string, retryCount = 0): Promise<ScrapedArticle | null> {
    try {
      console.log(`Scraping ${url} (attempt ${retryCount + 1})`);
      const response = await axios.get(url, {
        headers: {
          'User-Agent': WebScraper.USER_AGENT,
          'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
          'Accept-Language': 'en-US,en;q=0.5',
          'Accept-Encoding': 'gzip, deflate, br',
          'Connection': 'keep-alive',
          'Upgrade-Insecure-Requests': '1',
        },
        timeout: WebScraper.TIMEOUT,
        maxRedirects: 5,
      });
      const $ = cheerio.load(response.data);
      // Extract article data using multiple selectors for different news sites.
      const article = await this.extractArticleData($, url);
      if (!article) {
        console.warn(`Failed to extract article data from ${url}`);
        return null;
      }
      console.log(`Successfully scraped: ${article.title}`);
      return article;
    } catch (error: any) {
      console.error(`Error scraping ${url}:`, error.message);
      if (retryCount < WebScraper.MAX_RETRIES) {
        console.log(`Retrying ${url} in 2 seconds...`);
        await new Promise(resolve => setTimeout(resolve, 2000));
        return this.scrapeArticle(url, retryCount + 1);
      }
      return null;
    }
  }

  /**
   * Pull all article fields out of a loaded document.
   * Returns null when either the title or the body could not be found;
   * all other fields are optional or have fallbacks.
   */
  private async extractArticleData($: cheerio.CheerioAPI, url: string): Promise<ScrapedArticle | null> {
    const title = this.extractTitle($);
    const summary = this.extractSummary($);
    const body = this.extractBody($);
    const imageUrl = this.extractMainImage($, url);
    const publishedAt = this.extractPublishedDate($);
    const author = this.extractAuthor($);
    const tags = this.extractTags($);
    if (!title || !body) {
      console.warn(`Missing essential data for ${url}. Title: ${!!title}, Body: ${!!body}`);
      return null;
    }
    return {
      url,
      title,
      // Only generate a summary (network call) when the page didn't provide one.
      summary: summary || (await this.generateSummaryFromBody(body, title)),
      body,
      imageUrl,
      publishedAt: publishedAt || new Date(),
      author,
      tags,
    };
  }

  /** Extract the headline, trying site-specific selectors before generic ones. */
  private extractTitle($: cheerio.CheerioAPI): string {
    const selectors = [
      'h1.article-title',
      'h1.entry-title',
      'h1[class*="headline"]',
      'h1[class*="title"]',
      '.article-header h1',
      '.post-title',
      'h1',
      'title',
      '[property="og:title"]',
    ];
    for (const selector of selectors) {
      const element = $(selector).first();
      if (element.length) {
        // FIX: og:title is a <meta> tag — its value lives in the `content`
        // attribute, not in text(). The original read text() and so this
        // fallback could never match. Also check the FIRST match only, to
        // agree with what is returned.
        const text = selector.includes('property')
          ? element.attr('content')
          : element.text();
        if (text && text.trim()) {
          return text.trim();
        }
      }
    }
    return '';
  }

  /** Extract a pre-existing summary/description; empty string when absent. */
  private extractSummary($: cheerio.CheerioAPI): string {
    const selectors = [
      '.article-summary',
      '.entry-summary',
      '.article-excerpt',
      '.post-excerpt',
      '[class*="summary"]',
      '[class*="excerpt"]',
      '[property="og:description"]',
      '[name="description"]',
    ];
    for (const selector of selectors) {
      const element = $(selector);
      if (element.length) {
        // Meta tags carry their value in the `content` attribute.
        const text = selector.includes('property') || selector.includes('name')
          ? element.attr('content')
          : element.text();
        if (text && text.trim()) {
          return text.trim();
        }
      }
    }
    return '';
  }

  /**
   * Extract the main article text. Joins qualifying elements with blank
   * lines; an extraction only counts when it yields > 200 characters, so a
   * generic selector that matched boilerplate falls through to the next one.
   */
  private extractBody($: cheerio.CheerioAPI): string {
    const selectors = [
      '.article-content',
      '.entry-content',
      '.post-content',
      '.article-body',
      '[class*="content"]',
      '.story-body',
      '.article p',
      '.post p',
    ];
    for (const selector of selectors) {
      const elements = $(selector);
      if (elements.length) {
        // Get text from all paragraphs and join them.
        let bodyText = '';
        elements.each((_, el) => {
          const text = $(el).text().trim();
          if (text && text.length > 50) { // Skip short elements (ads, captions, etc.)
            bodyText += text + '\n\n';
          }
        });
        if (bodyText.length > 200) { // Ensure we have substantial content
          return bodyText.trim();
        }
      }
    }
    return '';
  }

  /**
   * Find the main/hero image and return it as an absolute http(s) URL.
   * Protocol-relative (`//…`) and root-relative (`/…`) sources are resolved
   * against the article's own URL; anything else non-http is skipped.
   */
  private extractMainImage($: cheerio.CheerioAPI, baseUrl: string): string | undefined {
    const selectors = [
      '.article-image img',
      '.featured-image img',
      '[class*="hero"] img',
      '.post-thumbnail img',
      '[property="og:image"]',
      'meta[property="og:image"]',
      '.article img:first',
      'img[class*="featured"]',
    ];
    for (const selector of selectors) {
      const element = $(selector);
      if (element.length) {
        // Meta tags use `content`; real <img> elements use `src`.
        let src = selector.includes('property') || selector.includes('meta')
          ? element.attr('content')
          : element.attr('src');
        if (src) {
          // Convert relative URLs to absolute.
          if (src.startsWith('//')) {
            src = 'https:' + src;
          } else if (src.startsWith('/')) {
            const urlObj = new URL(baseUrl);
            src = `${urlObj.origin}${src}`;
          }
          if (src.startsWith('http')) {
            return src;
          }
        }
      }
    }
    return undefined;
  }

  /**
   * Extract the publication date from metadata, <time datetime>, or visible
   * text. Returns null when no candidate parses to a valid Date.
   */
  private extractPublishedDate($: cheerio.CheerioAPI): Date | null {
    const selectors = [
      '[property="article:published_time"]',
      '[name="publish-date"]',
      '.publish-date',
      '.article-date',
      '.entry-date',
      'time[datetime]',
      '[class*="date"]',
    ];
    for (const selector of selectors) {
      const element = $(selector);
      if (element.length) {
        // Prefer machine-readable attributes over display text.
        const dateStr = element.attr('content') || element.attr('datetime') || element.text();
        if (dateStr) {
          const date = new Date(dateStr);
          if (!isNaN(date.getTime())) {
            return date;
          }
        }
      }
    }
    return null;
  }

  /** Extract the byline author; undefined when none is found. */
  private extractAuthor($: cheerio.CheerioAPI): string | undefined {
    const selectors = [
      '[rel="author"]',
      '.author-name',
      '.byline',
      '[class*="author"]',
      '[property="article:author"]',
    ];
    for (const selector of selectors) {
      const element = $(selector);
      if (element.length) {
        const author = selector.includes('property')
          ? element.attr('content')
          : element.text();
        if (author && author.trim()) {
          return author.trim();
        }
      }
    }
    return undefined;
  }

  /** Collect tags/categories from every matching selector, deduplicated. */
  private extractTags($: cheerio.CheerioAPI): string[] {
    const tags = new Set<string>();
    const selectors = [
      '.tags a',
      '.tag-list a',
      '[class*="tag"] a',
      '.categories a',
      '[rel="tag"]',
    ];
    for (const selector of selectors) {
      $(selector).each((_, el) => {
        const tag = $(el).text().trim();
        if (tag) {
          tags.add(tag);
        }
      });
    }
    return Array.from(tags);
  }

  /**
   * Produce a summary for an article that has none.
   *
   * First tries the local AI summary endpoint; on any failure falls back to
   * taking the first sentences of the body, capped at 150 characters.
   * NOTE(review): the endpoint URL is hardcoded to localhost:5000 — consider
   * making it configurable via an environment variable.
   */
  private async generateSummaryFromBody(body: string, title?: string): Promise<string> {
    try {
      // Try AI-powered summary generation first.
      const response = await axios.post('http://localhost:5000/api/generate-summary', {
        content: body,
        title: title || ''
      }, {
        timeout: 10000, // 10 second timeout
        headers: {
          'Content-Type': 'application/json'
        }
      });
      if (response.data?.summary) {
        console.log('✅ AI-generated summary created');
        return response.data.summary;
      }
    } catch (error) {
      console.warn('⚠️ AI summary generation failed, using fallback method:', error instanceof Error ? error.message : 'Unknown error');
    }
    // Fallback: strip a leading syndication notice, then take whole sentences
    // until the 150-character budget is reached.
    const cleanedBody = body.replace(/^This article was originally published at .+?\.\n\n/i, '').trim();
    const sentences = cleanedBody.split(/[.!?]+/);
    let summary = '';
    for (const sentence of sentences) {
      const trimmed = sentence.trim();
      if (trimmed.length < 10) continue; // Skip very short sentences
      if (summary.length + trimmed.length > 150) break;
      summary += (summary ? '. ' : '') + trimmed;
    }
    return (summary + (summary ? '.' : 'Content not available.')).substring(0, 150);
  }

  /**
   * Download an image to attached_assets/scraped/<filename> and read its
   * dimensions with sharp. Resolves to null on any failure.
   */
  async downloadImage(imageUrl: string, filename: string): Promise<ScrapedImage | null> {
    try {
      console.log(`Downloading image: ${imageUrl}`);
      const response = await axios.get(imageUrl, {
        responseType: 'stream',
        headers: {
          'User-Agent': WebScraper.USER_AGENT,
        },
        timeout: WebScraper.TIMEOUT,
      });
      const assetsDir = join(process.cwd(), 'attached_assets', 'scraped');
      const imagePath = join(assetsDir, filename);
      const writer = createWriteStream(imagePath);
      response.data.pipe(writer);
      await new Promise((resolve, reject) => {
        writer.on('finish', resolve);
        writer.on('error', reject);
      });
      // Get image metadata.
      const metadata = await sharp(imagePath).metadata();
      // FIX: original logged the literal text "$(unknown)" — the template
      // placeholder was mangled. Log the actual filename.
      console.log(`Successfully downloaded: ${filename} (${metadata.width}x${metadata.height})`);
      return {
        url: imageUrl,
        filename,
        width: metadata.width,
        height: metadata.height,
      };
    } catch (error: any) {
      console.error(`Error downloading image ${imageUrl}:`, error.message);
      return null;
    }
  }

  /**
   * Create a square, center-cropped JPEG thumbnail of a downloaded image.
   *
   * @param size Edge length in pixels (default 300).
   * @returns true on success, false on any sharp failure.
   */
  async createThumbnail(imagePath: string, thumbnailPath: string, size = 300): Promise<boolean> {
    try {
      await sharp(imagePath)
        .resize(size, size, {
          fit: 'cover',
          position: 'center',
        })
        .jpeg({ quality: 80 })
        .toFile(thumbnailPath);
      return true;
    } catch (error: any) {
      console.error(`Error creating thumbnail:`, error.message);
      return false;
    }
  }

  /**
   * Scrape many URLs with bounded concurrency, returning successes and
   * failures separately so the caller can report/retry the failed ones.
   *
   * Note: scrapeArticle resolves null instead of rejecting, so `error` here
   * is usually the generic "Failed to scrape" message rather than the
   * underlying cause (which was already logged).
   *
   * @param maxConcurrency Number of URLs scraped in parallel per batch.
   */
  async scrapeMultipleArticles(urls: string[], maxConcurrency = 5): Promise<{
    successes: ScrapedArticle[];
    failures: Array<{url: string; error: string}>;
  }> {
    const successes: ScrapedArticle[] = [];
    const failures: Array<{url: string; error: string}> = [];
    const chunks = this.chunkArray(urls, maxConcurrency);
    for (let i = 0; i < chunks.length; i++) {
      const promises = chunks[i].map(url =>
        this.scrapeArticle(url)
          .then(result => ({ url, result, error: null }))
          .catch(error => ({ url, result: null, error: error.message || 'Unknown error' }))
      );
      const chunkResults = await Promise.all(promises);
      // Separate successes and failures.
      for (const { url, result, error } of chunkResults) {
        if (result) {
          successes.push(result);
        } else {
          failures.push({ url, error: error || 'Failed to scrape' });
        }
      }
      // FIX: only pause BETWEEN batches — the original also slept for a
      // second after the final batch, delaying every call for no reason.
      if (i < chunks.length - 1) {
        await new Promise(resolve => setTimeout(resolve, 1000));
      }
    }
    return { successes, failures };
  }

  /** Split an array into consecutive chunks of at most chunkSize elements. */
  private chunkArray<T>(array: T[], chunkSize: number): T[][] {
    const chunks: T[][] = [];
    for (let i = 0; i < array.length; i += chunkSize) {
      chunks.push(array.slice(i, i + chunkSize));
    }
    return chunks;
  }
}