feat: SAPIENS Mobile App - Initial commit
React Native mobile application for SAPIENS news platform. Consolidated all previous history into single commit. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
349
comprehensive-scraping.cjs
Normal file
349
comprehensive-scraping.cjs
Normal file
@ -0,0 +1,349 @@
|
||||
// Comprehensive article scraping for all outlets from attached links
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
|
||||
// Mapping from display names to outlet IDs in our system, grouped into
// people, topics, and companies. The table is frozen so that it cannot be
// modified by accident at runtime.
// NOTE(review): nothing in the visible part of this script reads this table —
// confirm whether it is still needed.
const nameToOutletIdMap = Object.freeze({
  // People
  'Ala Shaabana': 'ala-shaabana',
  'Alex Karp': 'alex-karp',
  'Arthur Hayes': 'arthur-hayes',
  'Donald Trump Jr.': 'donald-trump-jr',
  'Eric Trump': 'eric-trump',
  'Jacob Robert Steeves': 'jacob-robert-steeves',
  'Jared Kushner': 'jared-kushner',
  'J.D. Vance': 'jd-vance',
  'Jensen Huang': 'jensen-huang',
  'Jerome Powell': 'jerome-powell',
  'Joseph Jacks': 'joseph-jacks',
  'Larry Ellison': 'larry-ellison',
  'Lily Liu': 'lily-liu',
  'Marco Rubio': 'marco-rubio',
  'Robert Myers': 'robert-myers',
  'Sam Altman': 'sam-altman',
  'Satya Nadella': 'satya-nadella',
  'Scott Bessent': 'scott-bessent',
  'Simon Kim': 'simon-kim',
  'Yat Siu': 'yat-siu',

  // Topics
  'AI': 'ai',
  'Altcoin': 'alt-coin',
  'Bollywood': 'bollywood',
  'CantoPop': 'cantopop',
  'CBDC': 'cbdc',
  'CFTC': 'cftc',
  'Crypto': 'crypto',
  'Custody Regulation': 'custody-regulation',
  'DAT': 'dat',
  'Decentralized AI': 'decentralized-ai',
  'DeFi': 'defi',
  'DEX': 'dex',
  'Fed': 'fed',
  'FOMC': 'fomc',
  'J-Star': 'j-star',
  'K-Star': 'k-star',
  'NFT': 'nft',
  'RWA': 'rwa',
  'SEC': 'sec',
  'Stablecoin': 'stable-coin',
  'SWF': 'swf',

  // Companies
  'Ava Labs': 'ava-labs',
  'Bittensor': 'bittensor',
  'BlackRock': 'blackrock',
  'Boston Dynamics': 'boston-dynamics',
  'Chainlink': 'chainlink',
  'Circle': 'circle',
  'CME Group': 'cme-group',
  'Epic Games': 'epic-games',
  'Hashed': 'hashed',
  'Hyperliquid': 'hyperliquid',
  'Oblong': 'oblong',
  'OpenSea': 'opensea',
  'Palantir': 'palantir',
  'PancakeSwap': 'pancakeswap',
  'Polygon': 'polygon',
  'Saudi Aramco': 'saudi-aramco',
  'Solana Foundation': 'solana-foundation',
  'TAOX': 'taox',
  'TRON': 'tron',
  'TSMC': 'tsmc',
  'Uniswap': 'uniswap',
  'World Liberty Financial': 'world-liberty-financial',
  'xTAO': 'xtao',
  'YUMA': 'yuma',
});
|
||||
|
||||
// Article links organized by outlet ID. main() walks this table in insertion
// order and scrapes each listed URL. Both the table and every URL list are
// frozen so the scraping plan cannot be changed by accident at runtime.
const outletsAndUrls = Object.freeze({
  // People - Ala Shaabana
  'ala-shaabana': Object.freeze([
    'https://www.rootdata.com/news/323625',
    'https://ffnews.com/newsarticle/funding/xtao-tsx-venture-listing/',
    'https://stealthex.io/blog/bittensor-price-prediction-can-tao-coin-reach-1000/',
    'https://www.gate.com/learn/articles/understanding-bittensor-protocol/2203',
    'https://www.investing.com/news/cryptocurrency-news/b-dash-ventures-and-hashed-announce-sponsors-and-main-speakers-for-blockchain-leaders-summit-tokyo-2025-4198036',
  ]),

  // Alex Karp
  'alex-karp': Object.freeze([
    'https://www.thestreet.com/technology/salesforce-ceo-praises-palantir-as-it-closes-950m-uk-defense-deal',
    'https://www.msn.com/en-us/money/topstocks/palantir-ceo-alexander-karp-s-new-plan-to-sell-1-2-billion-of-stock/ar-AA1zn2AU',
    'https://www.aol.com/palantir-stock-investors-just-got-093000800.html',
    'https://www.benzinga.com/markets/equities/25/08/47296576/palantir-ceo-alex-karp-dumps-63-million-in-stock-as-pltr-surges-111-this-year',
    'https://finance.yahoo.com/news/palantir-ceo-alex-karp-just-090200616.html',
  ]),

  // Arthur Hayes
  'arthur-hayes': Object.freeze([
    'https://www.ccn.com/news/crypto/arthur-hayes-hype-pivotal-momen-ultra-bullish/',
    'https://cointelegraph.com/news/arthur-hayes-sold-all-hype-ferrari-testarossa',
    'https://cryptonews.com/news/bitmex-co-founder-arthur-hayes-dumps-entire-hype-bag-for-a-ferrari/',
    'https://www.cryptopolitan.com/arthur-hayes-sells-5-1m-in-hype/',
    'https://coincentral.com/hayes-dumps-5-1m-hype-position-shortly-after-making-126x-price-call/',
  ]),

  // Jacob Robert Steeves
  'jacob-robert-steeves': Object.freeze([
    'https://eng.ambcrypto.com/will-ai-coin-tao-reach-3000-as-its-first-halving-approaches/',
    'https://news.ssbcrack.com/bittensors-tao-coin-can-it-really-hit-3000-amid-upcoming-halving-and-ai-buzz/',
    'https://www.globenewswire.com/news-release/2025/04/29/3070182/0/en/Alpha-Sigma-Capital-Research-Publishes-New-Report-on-Bittensor-TAO-Decentralized-Neural-Internet-Model.html',
    'https://www.chainup.com/market-update/bittensor-the-ai-alpha/',
    'https://usethebitcoin.com/crypto-personalities/all-you-need-to-know-about-jacob-robert-steeves-the-co-founder-of-bittensor/',
  ]),

  // Joseph Jacks
  'joseph-jacks': Object.freeze([
    'https://fox4kc.com/business/press-releases/ein-presswire/842930120/cossa-launches-as-the-definitive-organization-for-the-26b-commercial-open-source-market',
    'https://www.prnewswire.com/news-releases/tao-synergies-welcomes-top-bittensor-tao-leader-as-advisor-for-ai-focused-crypto-treasury-strategy-302538426.html',
    'https://www.ainvest.com/news/joseph-jacks-joins-tao-synergies-advisor-ai-focused-crypto-treasury-strategy-2508/',
    'https://fintech.global/2025/08/01/comp-ai-secures-2-6m-to-transform-soc-2-compliance/',
    'https://www.stocktitan.net/news/TAOX/tao-synergies-welcomes-top-bittensor-tao-leader-as-advisor-for-ai-641ecubmt9fz.html',
  ]),

  // Robert Myers
  'robert-myers': Object.freeze([
    'https://oss.capital/news/',
    'https://www.proofoftalk.io/speakers/robert-myers',
    'https://coinfomania.com/singularitynet-ceo-ben-goertzel-teams-up-with-fetch-ais-humayun-sheikh-to-explore-decentralized-artificial-intelligence-at-proof-of-talk-2025/',
    'https://www.theblock.co/post/353065/bittensor-tao-token-crypto-investors',
  ]),

  // Topics - AI
  'ai': Object.freeze([
    'https://www.crescendo.ai/news/latest-ai-news-and-updates',
    'https://www.reuters.com/technology/artificial-intelligence/',
    'https://www.marketingprofs.com/opinions/2025/53723/ai-update-september-19-2025-ai-news-and-views-from-the-past-week',
    'https://www.wndu.com/2025/09/19/artificial-intelligence-ai-update-google-microsoft-apple-meta-quantum-technology-finance/',
    'https://solutionsreview.com/artificial-intelligence-news-for-the-week-of-september-19-updates-from-druid-ai-dxc-technology-g-p-more/',
  ]),

  // Alt Coin
  'alt-coin': Object.freeze([
    'https://cryptopotato.com/bitcoin-joins-the-altcoin-bloodbath-with-a-sudden-flash-crash-to-112k/',
    'https://www.ainvest.com/news/michael-saylor-strategic-move-september-crypto-presale-outperform-bitcoin-2509/',
    'https://coinpaper.com/11193/5-signs-the-2025-altseason-could-be-bigger-than-ever',
    'https://cryptodnes.bg/en/analyst-says-2025-altcoin-rally-could-mirror-past-surges-here-is-why/',
    'https://tangem.com/en/blog/post/what-is-altseason/',
  ]),

  // Stable Coin
  'stable-coin': Object.freeze([
    'https://www.coindesk.com/markets/2025/09/19/u-s-stablecoin-battle-could-be-zero-sum-game-jpmorgan',
    'https://www.circle.com/usdc',
    'https://www.theblock.co/post/370543/tethers-hedge-and-expand-us-strategy-puts-circle-on-defense-in-market-shake-up-tests-oversight-versus-privacy',
    'https://breet.io/blog/usdt-vs-usdc',
    'https://www.dlnews.com/articles/markets/tether-faces-uphill-battle-launching-usat-stablecoin-in-us/',
  ]),

  // Bittensor
  'bittensor': Object.freeze([
    'https://eng.ambcrypto.com/will-ai-coin-tao-reach-3000-as-its-first-halving-approaches/',
    'https://news.ssbcrack.com/bittensors-tao-coin-can-it-really-hit-3000-amid-upcoming-halving-and-ai-buzz/',
    'https://www.chainup.com/market-update/bittensor-the-ai-alpha/',
  ]),
});
|
||||
|
||||
// Named HTML entities understood by decodeHtmlEntities; any other named
// entity is left in the text untouched.
const NAMED_HTML_ENTITIES = new Map([
  ['amp', '&'],
  ['lt', '<'],
  ['gt', '>'],
  ['quot', '"'],
  ['apos', "'"],
  ['nbsp', ' '],
]);

// Upper bound for downloading one page, so a single unresponsive server
// cannot stall the whole scraping run.
const SCRAPE_TIMEOUT_MS = 15000;

/**
 * Decode numeric (`&#39;`, `&#x27;`) and a few common named HTML entities.
 * Unknown or out-of-range entities are returned unchanged.
 *
 * @param {string} text - Raw text taken from HTML markup.
 * @returns {string} The text with recognised entities decoded.
 */
function decodeHtmlEntities(text) {
  return text.replace(/&(#x[0-9a-f]+|#\d+|[a-z]+);/gi, (entity, name) => {
    if (name.startsWith('#')) {
      const isHex = name[1] === 'x' || name[1] === 'X';
      const codePoint = Number.parseInt(name.slice(isHex ? 2 : 1), isHex ? 16 : 10);
      // String.fromCodePoint throws outside the Unicode range, so guard it.
      return codePoint > 0 && codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : entity;
    }
    return NAMED_HTML_ENTITIES.get(name.toLowerCase()) ?? entity;
  });
}

/**
 * Return the `content` of the first `<meta>` tag whose `attrName` attribute
 * equals `attrValue` (case-insensitive) and whose content is not blank.
 * Attribute order inside the tag does not matter, and a value is read up to
 * its own closing quote, so an apostrophe inside a double-quoted value does
 * not cut it short.
 *
 * @param {string} html - Full page markup.
 * @param {string} attrName - Lower-case attribute to test, e.g. 'property'.
 * @param {string} attrValue - Lower-case expected value, e.g. 'og:title'.
 * @returns {string|null} The raw (still entity-encoded) content, or null.
 */
function extractMetaContent(html, attrName, attrValue) {
  for (const [tag] of html.matchAll(/<meta\b[^>]*>/gi)) {
    const attributes = new Map();
    const attributePattern = /([^\s"'=<>/]+)\s*=\s*(?:"([^"]*)"|'([^']*)')/g;
    for (const [, name, doubleQuoted, singleQuoted] of tag.matchAll(attributePattern)) {
      const key = name.toLowerCase();
      if (!attributes.has(key)) {
        attributes.set(key, doubleQuoted ?? singleQuoted);
      }
    }

    const content = attributes.get('content');
    if (attributes.get(attrName)?.toLowerCase() === attrValue && content && content.trim() !== '') {
      return content;
    }
  }
  return null;
}

/**
 * Remove one trailing site-name suffix from a page title. Everything from the
 * first `|` is dropped; when there is no pipe, the last segment introduced by
 * a dash that has whitespace on both sides is dropped. Hyphens inside words
 * ("AI-Focused") are never treated as separators. This is a heuristic: a
 * headline that itself contains " - " loses its final segment. If stripping
 * would leave nothing, the title is returned unchanged.
 *
 * @param {string} title - Trimmed title with whitespace already collapsed.
 * @returns {string} The title without its site-name suffix.
 */
function stripSiteSuffix(title) {
  const pipeIndex = title.indexOf('|');
  if (pipeIndex !== -1) {
    return title.slice(0, pipeIndex).trim() || title;
  }

  const separators = [...title.matchAll(/\s[-–—]\s/g)];
  if (separators.length === 0) {
    return title;
  }
  const lastSeparator = separators[separators.length - 1];
  return title.slice(0, lastSeparator.index).trim() || title;
}

/**
 * Download a page and extract a title, summary, short body and thumbnail
 * using regular expressions (no HTML parser is involved).
 *
 * - Title: first non-blank of `<title>`, `<h1>`, `og:title`; site suffix
 *   stripped; capped at 150 characters plus "...".
 * - Summary: first non-blank of the `description`, `og:description` and
 *   `twitter:description` meta tags, capped at 500 characters.
 * - Body: attribution line, the summary, then up to three `<p>` texts longer
 *   than 30 characters that do not mention "cookie" or "subscribe".
 * - Thumbnail: first of `og:image`, `twitter:image`, first `<img src>` that
 *   starts with "http"; otherwise the default asset path.
 *
 * Never throws: any network, HTTP or parsing error is logged and reported as
 * `null`.
 *
 * @param {string} url - Address of the article to scrape.
 * @returns {Promise<{title: string, summary: string, body: string, url: string, thumbnail: string} | null>}
 *   The extracted article, or null when the page could not be scraped.
 */
async function scrapeArticle(url) {
  try {
    console.log(` Scraping: ${url}`);

    const response = await fetch(url, {
      headers: {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
      },
      signal: AbortSignal.timeout(SCRAPE_TIMEOUT_MS),
    });

    if (!response.ok) {
      throw new Error(`HTTP ${response.status}`);
    }

    const html = await response.text();

    // Decode entities, collapse whitespace runs and cap the length.
    const normalizeText = (text, maxLength) =>
      decodeHtmlEntities(text).trim().replace(/\s+/g, ' ').substring(0, maxLength);

    // Title candidates in priority order.
    const titleCandidates = [
      html.match(/<title[^>]*>([^<]+)<\/title>/i)?.[1],
      html.match(/<h1[^>]*>([^<]+)<\/h1>/i)?.[1],
      extractMetaContent(html, 'property', 'og:title'),
    ];

    let title = 'Untitled Article';
    for (const candidate of titleCandidates) {
      const text = candidate ? normalizeText(candidate, 200) : '';
      if (text) {
        title = stripSiteSuffix(text);
        break;
      }
    }
    if (title.length > 150) {
      title = `${title.substring(0, 150)}...`;
    }

    // Summary candidates in priority order.
    const summaryCandidates = [
      extractMetaContent(html, 'name', 'description'),
      extractMetaContent(html, 'property', 'og:description'),
      extractMetaContent(html, 'name', 'twitter:description'),
    ];

    let summary = 'Article summary not available.';
    for (const candidate of summaryCandidates) {
      const text = candidate ? normalizeText(candidate, 500) : '';
      if (text) {
        summary = text;
        break;
      }
    }

    // Collect the first few usable paragraphs; only three are ever used.
    const maxParagraphs = 3;
    const paragraphPattern = /<p\b[^>]*>([^<]+(?:<[^>]*>[^<]*<\/[^>]*>[^<]*)*)<\/p>/gi;
    const bodyParagraphs = [];
    for (const [, innerHtml] of html.matchAll(paragraphPattern)) {
      // Strip tags before decoding so an encoded "&lt;b&gt;" stays as text.
      const cleanText = decodeHtmlEntities(innerHtml.replace(/<[^>]*>/g, ' '))
        .replace(/\s+/g, ' ')
        .trim();
      const lowered = cleanText.toLowerCase();
      const isBoilerplate = lowered.includes('cookie') || lowered.includes('subscribe');
      if (cleanText.length > 30 && !isBoilerplate) {
        bodyParagraphs.push(cleanText);
        if (bodyParagraphs.length === maxParagraphs) {
          break;
        }
      }
    }

    // Create body content
    let body = `This article was originally published at ${url}.\n\n${summary}`;
    if (bodyParagraphs.length > 0) {
      body += `\n\n${bodyParagraphs.join('\n\n')}`;
    } else {
      body += `\n\nThis content provides insights and analysis on current industry developments and trends in the cryptocurrency, technology, and financial sectors.`;
    }

    // Image candidates in priority order; `\ssrc=` keeps `data-src` from matching.
    const imageCandidates = [
      extractMetaContent(html, 'property', 'og:image'),
      extractMetaContent(html, 'name', 'twitter:image'),
      html.match(/<img\b[^>]*?\ssrc=(["'])(.*?)\1/i)?.[2],
    ];

    let thumbnail = '/api/assets/default-article.png';
    for (const candidate of imageCandidates) {
      const imageUrl = candidate ? decodeHtmlEntities(candidate).trim() : '';
      if (imageUrl.startsWith('http')) {
        thumbnail = imageUrl;
        break;
      }
    }

    return {
      title,
      summary,
      body,
      url,
      thumbnail,
    };
  } catch (error) {
    console.error(` ✗ Error scraping ${url}:`, error.message);
    return null;
  }
}
|
||||
|
||||
/**
 * POST one scraped article to the articles API.
 *
 * The request body carries the outlet ID, the article's title, summary, body
 * and thumbnail, the current time as `publishedAt`, an empty `tags` list and
 * a `viewCount` of 0. The article's source `url` is not sent.
 *
 * Never throws: a non-2xx response or a network error is logged and reported
 * as `null`.
 *
 * @param {string} outletId - Outlet the article is filed under.
 * @param {{title: string, summary: string, body: string, thumbnail: string}} articleData
 *   Article fields as produced by scrapeArticle.
 * @param {string} [endpoint='http://localhost:5000/api/articles'] - URL of
 *   the articles collection; override to target another server.
 * @returns {Promise<object|null>} The parsed JSON response, or null on failure.
 */
async function addArticleToStorage(
  outletId,
  articleData,
  endpoint = 'http://localhost:5000/api/articles',
) {
  try {
    const { title, summary, body, thumbnail } = articleData;

    const response = await fetch(endpoint, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({
        outletId,
        title,
        summary,
        body,
        thumbnail,
        publishedAt: new Date().toISOString(),
        tags: [],
        viewCount: 0,
      }),
    });

    if (!response.ok) {
      const errorText = await response.text();
      console.error(` ✗ Failed to add article (${response.status}): ${errorText}`);
      return null;
    }

    const result = await response.json();
    // Only show an ellipsis when the title was actually cut.
    const shownTitle = title.length > 80 ? `${title.substring(0, 80)}...` : title;
    console.log(` ✓ Added: ${shownTitle}`);
    return result;
  } catch (error) {
    console.error(` ✗ Error adding article:`, error.message);
    return null;
  }
}
|
||||
|
||||
/**
 * Scrape every outlet listed in `outletsAndUrls` and store the results.
 *
 * For each outlet, at most five URLs are scraped one after another with
 * scrapeArticle; each successfully scraped article is posted with
 * addArticleToStorage. A fixed pause separates consecutive page requests.
 * Progress and a final summary (URLs processed, articles added, success
 * rate) are written to the console.
 *
 * @returns {Promise<void>} Resolves once every outlet has been processed.
 */
async function main() {
  const maxArticlesPerOutlet = 5;
  const requestDelayMs = 1500;

  console.log('🚀 Starting comprehensive article scraping...\n');

  let totalScraped = 0;
  let totalAdded = 0;

  for (const [outletId, urls] of Object.entries(outletsAndUrls)) {
    console.log(`\n📰 === Scraping articles for ${outletId} ===`);
    console.log(` Found ${urls.length} URLs to scrape`);

    let articleCount = 0;

    for (const url of urls.slice(0, maxArticlesPerOutlet)) { // Limit to 5 articles per outlet
      // Respectful delay between consecutive requests. It is skipped before
      // the very first request, so the run does not end with a useless wait.
      if (totalScraped > 0) {
        await new Promise((resolve) => setTimeout(resolve, requestDelayMs));
      }
      totalScraped++;

      const articleData = await scrapeArticle(url);
      if (!articleData) {
        continue;
      }

      const result = await addArticleToStorage(outletId, articleData);
      if (result) {
        totalAdded++;
        articleCount++;
      }
    }

    console.log(` ✅ Added ${articleCount} articles for ${outletId}`);
  }

  // Avoid printing "NaN%" when there was nothing to process.
  const successRate = totalScraped === 0 ? 0 : Math.round((totalAdded / totalScraped) * 100);

  console.log(`\n🎉 === Scraping Summary ===`);
  console.log(` 📊 Total URLs processed: ${totalScraped}`);
  console.log(` ✅ Articles successfully added: ${totalAdded}`);
  console.log(` 📈 Success rate: ${successRate}%`);
  console.log(`\n✨ Comprehensive scraping completed!`);
}
|
||||
|
||||
// Run the scraper. An unexpected failure is logged and the process exit code
// is set to 1 so that a shell or CI job invoking this script can tell the run
// did not complete.
main().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});
|
||||
Reference in New Issue
Block a user