// Comprehensive article scraping for all outlets from attached links
const fs = require('fs');
const path = require('path');

// Mapping from names to outlet IDs in our system
const nameToOutletIdMap = {
  // People
  'Ala Shaabana': 'ala-shaabana',
  'Alex Karp': 'alex-karp',
  'Arthur Hayes': 'arthur-hayes',
  'Donald Trump Jr.': 'donald-trump-jr',
  'Eric Trump': 'eric-trump',
  'Jacob Robert Steeves': 'jacob-robert-steeves',
  'Jared Kushner': 'jared-kushner',
  'J.D. Vance': 'jd-vance',
  'Jensen Huang': 'jensen-huang',
  'Jerome Powell': 'jerome-powell',
  'Joseph Jacks': 'joseph-jacks',
  'Larry Ellison': 'larry-ellison',
  'Lily Liu': 'lily-liu',
  'Marco Rubio': 'marco-rubio',
  'Robert Myers': 'robert-myers',
  'Sam Altman': 'sam-altman',
  'Satya Nadella': 'satya-nadella',
  'Scott Bessent': 'scott-bessent',
  'Simon Kim': 'simon-kim',
  'Yat Siu': 'yat-siu',
  // Topics
  'AI': 'ai',
  'Altcoin': 'alt-coin',
  'Bollywood': 'bollywood',
  'CantoPop': 'cantopop',
  'CBDC': 'cbdc',
  'CFTC': 'cftc',
  'Crypto': 'crypto',
  'Custody Regulation': 'custody-regulation',
  'DAT': 'dat',
  'Decentralized AI': 'decentralized-ai',
  'DeFi': 'defi',
  'DEX': 'dex',
  'Fed': 'fed',
  'FOMC': 'fomc',
  'J-Star': 'j-star',
  'K-Star': 'k-star',
  'NFT': 'nft',
  'RWA': 'rwa',
  'SEC': 'sec',
  'Stablecoin': 'stable-coin',
  'SWF': 'swf',
  // Companies
  'Ava Labs': 'ava-labs',
  'Bittensor': 'bittensor',
  'BlackRock': 'blackrock',
  'Boston Dynamics': 'boston-dynamics',
  'Chainlink': 'chainlink',
  'Circle': 'circle',
  'CME Group': 'cme-group',
  'Epic Games': 'epic-games',
  'Hashed': 'hashed',
  'Hyperliquid': 'hyperliquid',
  'Oblong': 'oblong',
  'OpenSea': 'opensea',
  'Palantir': 'palantir',
  'PancakeSwap': 'pancakeswap',
  'Polygon': 'polygon',
  'Saudi Aramco': 'saudi-aramco',
  'Solana Foundation': 'solana-foundation',
  'TAOX': 'taox',
  'TRON': 'tron',
  'TSMC': 'tsmc',
  'Uniswap': 'uniswap',
  'World Liberty Financial': 'world-liberty-financial',
  'xTAO': 'xtao',
  'YUMA': 'yuma'
};

// Article links organized by outlet
const outletsAndUrls = {
  // People - Ala Shaabana
  'ala-shaabana': [
    'https://www.rootdata.com/news/323625',
    'https://ffnews.com/newsarticle/funding/xtao-tsx-venture-listing/',
    'https://stealthex.io/blog/bittensor-price-prediction-can-tao-coin-reach-1000/',
    'https://www.gate.com/learn/articles/understanding-bittensor-protocol/2203',
    'https://www.investing.com/news/cryptocurrency-news/b-dash-ventures-and-hashed-announce-sponsors-and-main-speakers-for-blockchain-leaders-summit-tokyo-2025-4198036'
  ],
  // Alex Karp
  'alex-karp': [
    'https://www.thestreet.com/technology/salesforce-ceo-praises-palantir-as-it-closes-950m-uk-defense-deal',
    'https://www.msn.com/en-us/money/topstocks/palantir-ceo-alexander-karp-s-new-plan-to-sell-1-2-billion-of-stock/ar-AA1zn2AU',
    'https://www.aol.com/palantir-stock-investors-just-got-093000800.html',
    'https://www.benzinga.com/markets/equities/25/08/47296576/palantir-ceo-alex-karp-dumps-63-million-in-stock-as-pltr-surges-111-this-year',
    'https://finance.yahoo.com/news/palantir-ceo-alex-karp-just-090200616.html'
  ],
  // Arthur Hayes
  'arthur-hayes': [
    'https://www.ccn.com/news/crypto/arthur-hayes-hype-pivotal-momen-ultra-bullish/',
    'https://cointelegraph.com/news/arthur-hayes-sold-all-hype-ferrari-testarossa',
    'https://cryptonews.com/news/bitmex-co-founder-arthur-hayes-dumps-entire-hype-bag-for-a-ferrari/',
    'https://www.cryptopolitan.com/arthur-hayes-sells-5-1m-in-hype/',
    'https://coincentral.com/hayes-dumps-5-1m-hype-position-shortly-after-making-126x-price-call/'
  ],
  // Jacob Robert Steeves
  'jacob-robert-steeves': [
    'https://eng.ambcrypto.com/will-ai-coin-tao-reach-3000-as-its-first-halving-approaches/',
    'https://news.ssbcrack.com/bittensors-tao-coin-can-it-really-hit-3000-amid-upcoming-halving-and-ai-buzz/',
    'https://www.globenewswire.com/news-release/2025/04/29/3070182/0/en/Alpha-Sigma-Capital-Research-Publishes-New-Report-on-Bittensor-TAO-Decentralized-Neural-Internet-Model.html',
    'https://www.chainup.com/market-update/bittensor-the-ai-alpha/',
    'https://usethebitcoin.com/crypto-personalities/all-you-need-to-know-about-jacob-robert-steeves-the-co-founder-of-bittensor/'
  ],
  // Joseph Jacks
  'joseph-jacks': [
    'https://fox4kc.com/business/press-releases/ein-presswire/842930120/cossa-launches-as-the-definitive-organization-for-the-26b-commercial-open-source-market',
    'https://www.prnewswire.com/news-releases/tao-synergies-welcomes-top-bittensor-tao-leader-as-advisor-for-ai-focused-crypto-treasury-strategy-302538426.html',
    'https://www.ainvest.com/news/joseph-jacks-joins-tao-synergies-advisor-ai-focused-crypto-treasury-strategy-2508/',
    'https://fintech.global/2025/08/01/comp-ai-secures-2-6m-to-transform-soc-2-compliance/',
    'https://www.stocktitan.net/news/TAOX/tao-synergies-welcomes-top-bittensor-tao-leader-as-advisor-for-ai-641ecubmt9fz.html'
  ],
  // Robert Myers
  'robert-myers': [
    'https://oss.capital/news/',
    'https://www.proofoftalk.io/speakers/robert-myers',
    'https://coinfomania.com/singularitynet-ceo-ben-goertzel-teams-up-with-fetch-ais-humayun-sheikh-to-explore-decentralized-artificial-intelligence-at-proof-of-talk-2025/',
    'https://www.theblock.co/post/353065/bittensor-tao-token-crypto-investors'
  ],
  // Topics - AI
  'ai': [
    'https://www.crescendo.ai/news/latest-ai-news-and-updates',
    'https://www.reuters.com/technology/artificial-intelligence/',
    'https://www.marketingprofs.com/opinions/2025/53723/ai-update-september-19-2025-ai-news-and-views-from-the-past-week',
    'https://www.wndu.com/2025/09/19/artificial-intelligence-ai-update-google-microsoft-apple-meta-quantum-technology-finance/',
    'https://solutionsreview.com/artificial-intelligence-news-for-the-week-of-september-19-updates-from-druid-ai-dxc-technology-g-p-more/'
  ],
  // Alt Coin
  'alt-coin': [
    'https://cryptopotato.com/bitcoin-joins-the-altcoin-bloodbath-with-a-sudden-flash-crash-to-112k/',
    'https://www.ainvest.com/news/michael-saylor-strategic-move-september-crypto-presale-outperform-bitcoin-2509/',
    'https://coinpaper.com/11193/5-signs-the-2025-altseason-could-be-bigger-than-ever',
    'https://cryptodnes.bg/en/analyst-says-2025-altcoin-rally-could-mirror-past-surges-here-is-why/',
    'https://tangem.com/en/blog/post/what-is-altseason/'
  ],
  // Stable Coin
  'stable-coin': [
    'https://www.coindesk.com/markets/2025/09/19/u-s-stablecoin-battle-could-be-zero-sum-game-jpmorgan',
    'https://www.circle.com/usdc',
    'https://www.theblock.co/post/370543/tethers-hedge-and-expand-us-strategy-puts-circle-on-defense-in-market-shake-up-tests-oversight-versus-privacy',
    'https://breet.io/blog/usdt-vs-usdc',
    'https://www.dlnews.com/articles/markets/tether-faces-uphill-battle-launching-usat-stablecoin-in-us/'
  ],
  // Bittensor
  'bittensor': [
    'https://eng.ambcrypto.com/will-ai-coin-tao-reach-3000-as-its-first-halving-approaches/',
    'https://news.ssbcrack.com/bittensors-tao-coin-can-it-really-hit-3000-amid-upcoming-halving-and-ai-buzz/',
    'https://www.chainup.com/market-update/bittensor-the-ai-alpha/'
  ]
};

// Endpoint of the local storage API that receives scraped articles.
const ARTICLES_API_URL = 'http://localhost:5000/api/articles';
// Browser-like User-Agent sent with every scrape request.
const USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36';
// Thumbnail used when the page exposes no usable image.
const DEFAULT_THUMBNAIL = '/api/assets/default-article.png';
// Abort any single HTTP exchange that takes longer than this.
const REQUEST_TIMEOUT_MS = 15000;
// Pause between consecutive URLs so target sites are not hammered.
const REQUEST_DELAY_MS = 1500;
// Upper bound on URLs processed per outlet.
const MAX_ARTICLES_PER_OUTLET = 5;
// Number of body paragraphs copied into the stored article.
const MAX_BODY_PARAGRAPHS = 3;

// Named entities decoded by decodeHtmlEntities (a Map avoids prototype-key lookups).
const NAMED_ENTITIES = new Map([
  ['amp', '&'],
  ['lt', '<'],
  ['gt', '>'],
  ['quot', '"'],
  ['apos', "'"],
  ['nbsp', ' ']
]);

/**
 * Resolve after the given number of milliseconds.
 * @param {number} ms
 * @returns {Promise<void>}
 */
function delay(ms) {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

/**
 * Run an async task with an AbortSignal that fires after `timeoutMs`.
 * The timer is always cleared, so it never keeps the process alive.
 * @param {(signal: AbortSignal) => Promise<T>} task
 * @param {number} [timeoutMs]
 * @returns {Promise<T>}
 * @template T
 */
async function withTimeout(task, timeoutMs = REQUEST_TIMEOUT_MS) {
  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), timeoutMs);
  try {
    return await task(controller.signal);
  } catch (error) {
    if (error?.name === 'AbortError') {
      throw new Error(`Request timed out after ${timeoutMs} ms`, { cause: error });
    }
    throw error;
  } finally {
    clearTimeout(timer);
  }
}

/**
 * Decode the common named entities and all numeric character references.
 * Unknown entities are left untouched.
 * @param {string} text
 * @returns {string}
 */
function decodeHtmlEntities(text) {
  return text.replace(/&(#x[0-9a-f]+|#\d+|[a-z][a-z0-9]*);/gi, (entity, code) => {
    if (code[0] !== '#') {
      return NAMED_ENTITIES.get(code.toLowerCase()) ?? entity;
    }
    const isHex = code[1] === 'x' || code[1] === 'X';
    const codePoint = Number.parseInt(code.slice(isHex ? 2 : 1), isHex ? 16 : 10);
    // String.fromCodePoint throws outside this range, so keep the raw entity instead.
    return codePoint > 0 && codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : entity;
  });
}

/**
 * Decode entities, collapse whitespace and cap the length of a text fragment.
 * @param {string} raw
 * @param {number} maxLength
 * @returns {string}
 */
function normalizeText(raw, maxLength) {
  return decodeHtmlEntities(raw).trim().replace(/\s+/g, ' ').substring(0, maxLength);
}

/**
 * Parse the attribute portion of a tag into a lower-cased name -> value map.
 * Handles double-quoted, single-quoted and bare values; first occurrence wins.
 * @param {string} attributeText
 * @returns {Map<string, string>}
 */
function parseAttributes(attributeText) {
  const attributes = new Map();
  const pattern = /([^\s"'=<>\/]+)\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'>]+))/g;
  for (const [, name, doubleQuoted, singleQuoted, bare] of attributeText.matchAll(pattern)) {
    const key = name.toLowerCase();
    if (!attributes.has(key)) {
      attributes.set(key, doubleQuoted ?? singleQuoted ?? bare ?? '');
    }
  }
  return attributes;
}

/**
 * Yield the attribute map of every `<tagName ...>` opening tag in the document.
 * The tag pattern skips over quoted values so a `>` inside an attribute does not end the tag.
 * @param {string} html
 * @param {string} tagName - plain tag name such as 'meta' or 'img'
 * @returns {Generator<Map<string, string>>}
 */
function* findTags(html, tagName) {
  const tagPattern = new RegExp(`<${tagName}(?=[\\s/>])((?:"[^"]*"|'[^']*'|[^'">])*)>`, 'gi');
  for (const [, attributeText] of html.matchAll(tagPattern)) {
    yield parseAttributes(attributeText);
  }
}

/**
 * Return the `content` of the first <meta> tag whose `attrName` equals `attrValue`
 * (case-insensitive), regardless of attribute order or quote style.
 * @param {string} html
 * @param {string} attrName - 'name' or 'property'
 * @param {string} attrValue - e.g. 'og:title'
 * @returns {string|null}
 */
function findMetaContent(html, attrName, attrValue) {
  const wanted = attrValue.toLowerCase();
  for (const attributes of findTags(html, 'meta')) {
    if ((attributes.get(attrName) ?? '').toLowerCase() !== wanted) {
      continue;
    }
    const content = attributes.get('content');
    if (content && content.trim()) {
      return content;
    }
  }
  return null;
}

/**
 * Strip a trailing site-name suffix from a title and cap its length at 150 characters.
 * Only separators surrounded by whitespace count, so hyphenated words stay intact.
 * @param {string} title - whitespace-normalized title
 * @returns {string}
 */
function cleanTitle(title) {
  let cleaned = title.replace(/\|.*$/, '').trim();
  const separators = [...cleaned.matchAll(/\s[-–—]\s/g)];
  if (separators.length > 0) {
    cleaned = cleaned.slice(0, separators[separators.length - 1].index).trim();
  }
  if (!cleaned) {
    // Clean-up removed everything (e.g. "| Site"); the raw title beats an empty one.
    cleaned = title;
  }
  return cleaned.length > 150 ? `${cleaned.substring(0, 150)}...` : cleaned;
}

/**
 * Pick the article title from <title>, the first plain-text <h1>, or og:title.
 * @param {string} html
 * @returns {string} cleaned title, or 'Untitled Article' when none is found
 */
function extractTitle(html) {
  const candidates = [
    html.match(/<title(?:\s[^>]*)?>([^<]+)<\/title>/i)?.[1],
    html.match(/<h1(?:\s[^>]*)?>([^<]+)<\/h1>/i)?.[1],
    findMetaContent(html, 'property', 'og:title')
  ];
  for (const candidate of candidates) {
    const normalized = normalizeText(candidate ?? '', 200);
    if (normalized) {
      return cleanTitle(normalized);
    }
  }
  return 'Untitled Article';
}

/**
 * Pick the summary from the description, og:description or twitter:description meta tag.
 * @param {string} html
 * @returns {string} summary capped at 500 characters, or a placeholder sentence
 */
function extractSummary(html) {
  const candidates = [
    findMetaContent(html, 'name', 'description'),
    findMetaContent(html, 'property', 'og:description'),
    findMetaContent(html, 'name', 'twitter:description')
  ];
  for (const candidate of candidates) {
    const normalized = normalizeText(candidate ?? '', 500);
    if (normalized) {
      return normalized;
    }
  }
  return 'Article summary not available.';
}

/**
 * Collect the first few substantial <p> paragraphs as plain text.
 * Paragraphs of 30 characters or fewer, or mentioning cookies/subscriptions, are skipped.
 * @param {string} html
 * @returns {string[]} at most MAX_BODY_PARAGRAPHS paragraphs
 */
function extractParagraphs(html) {
  const paragraphPattern = /<p(?:\s[^>]*)?>([^<]+(?:<[^>]*>[^<]*<\/[^>]*>[^<]*)*)<\/p>/gi;
  const paragraphs = [];
  for (const match of html.matchAll(paragraphPattern)) {
    // Strip inline tags first so decoded "&lt;" text is never mistaken for markup.
    const text = decodeHtmlEntities(match[1].replace(/<[^>]*>/g, ' '))
      .replace(/\s+/g, ' ')
      .trim();
    if (text.length > 30 && !/cookie|subscribe/i.test(text)) {
      paragraphs.push(text);
      if (paragraphs.length === MAX_BODY_PARAGRAPHS) {
        break;
      }
    }
  }
  return paragraphs;
}

/**
 * Turn an image reference into an absolute http(s) URL.
 * Already-absolute URLs are returned as written; relative ones are resolved against `baseUrl`.
 * @param {string|null|undefined} candidate
 * @param {string} baseUrl
 * @returns {string|null} absolute URL, or null for empty/non-http(s) references
 */
function toAbsoluteHttpUrl(candidate, baseUrl) {
  const reference = decodeHtmlEntities(candidate ?? '').trim();
  if (!reference) {
    return null;
  }
  if (/^https?:\/\//i.test(reference)) {
    return reference;
  }
  try {
    const resolved = new URL(reference, baseUrl);
    return resolved.protocol === 'http:' || resolved.protocol === 'https:' ? resolved.href : null;
  } catch {
    return null;
  }
}

/**
 * Pick the article image from og:image, twitter:image, or the first usable <img src>.
 * @param {string} html
 * @param {string} baseUrl - URL used to resolve relative image paths
 * @returns {string} image URL, or DEFAULT_THUMBNAIL when nothing usable is found
 */
function extractThumbnail(html, baseUrl) {
  const metaCandidates = [
    findMetaContent(html, 'property', 'og:image'),
    findMetaContent(html, 'name', 'twitter:image')
  ];
  for (const candidate of metaCandidates) {
    const resolved = toAbsoluteHttpUrl(candidate, baseUrl);
    if (resolved) {
      return resolved;
    }
  }
  for (const attributes of findTags(html, 'img')) {
    const resolved = toAbsoluteHttpUrl(attributes.get('src'), baseUrl);
    if (resolved) {
      return resolved;
    }
  }
  return DEFAULT_THUMBNAIL;
}

/**
 * Build the article record from a page's HTML.
 * @param {string} html
 * @param {string} url - URL the article was requested from (stored in the record)
 * @param {string} [baseUrl] - URL used to resolve relative links; defaults to `url`
 * @returns {{title: string, summary: string, body: string, url: string, thumbnail: string}}
 */
function parseArticleHtml(html, url, baseUrl = url) {
  const title = extractTitle(html);
  const summary = extractSummary(html);
  const paragraphs = extractParagraphs(html);

  let body = `This article was originally published at ${url}.\n\n${summary}`;
  if (paragraphs.length > 0) {
    body += `\n\n` + paragraphs.join('\n\n');
  } else {
    body += `\n\nThis content provides insights and analysis on current industry developments and trends in the cryptocurrency, technology, and financial sectors.`;
  }

  const thumbnail = extractThumbnail(html, baseUrl);
  return { title, summary, body, url, thumbnail };
}

/**
 * Download a page and extract title, summary, body and thumbnail from it.
 * Failures (network error, timeout, non-2xx status) are logged, not thrown.
 * @param {string} url
 * @returns {Promise<{title: string, summary: string, body: string, url: string, thumbnail: string}|null>}
 *   the article record, or null when the page could not be scraped
 */
async function scrapeArticle(url) {
  try {
    console.log(` Scraping: ${url}`);
    // The timeout covers both the response headers and the body download.
    const { html, finalUrl } = await withTimeout(async (signal) => {
      const response = await fetch(url, {
        headers: { 'User-Agent': USER_AGENT },
        signal
      });
      if (!response.ok) {
        throw new Error(`HTTP ${response.status}`);
      }
      return { html: await response.text(), finalUrl: response.url || url };
    });
    return parseArticleHtml(html, url, finalUrl);
  } catch (error) {
    console.error(` ✗ Error scraping ${url}:`, error.message);
    return null;
  }
}

/**
 * POST a scraped article to the storage API.
 * Failures are logged, not thrown.
 * @param {string} outletId - outlet the article belongs to
 * @param {{title: string, summary: string, body: string, thumbnail: string}} articleData
 * @returns {Promise<object|null>} the parsed JSON response, or null on any failure
 */
async function addArticleToStorage(outletId, articleData) {
  try {
    return await withTimeout(async (signal) => {
      const response = await fetch(ARTICLES_API_URL, {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({
          outletId,
          title: articleData.title,
          summary: articleData.summary,
          body: articleData.body,
          thumbnail: articleData.thumbnail,
          publishedAt: new Date().toISOString(),
          tags: [],
          viewCount: 0
        }),
        signal
      });

      if (response.ok) {
        const result = await response.json();
        console.log(` ✓ Added: ${articleData.title.substring(0, 80)}...`);
        return result;
      }

      const errorText = await response.text();
      console.error(` ✗ Failed to add article (${response.status}): ${errorText}`);
      return null;
    });
  } catch (error) {
    console.error(` ✗ Error adding article:`, error.message);
    return null;
  }
}

/**
 * Scrape every configured outlet sequentially, store the results and print a summary.
 * @returns {Promise<void>}
 */
async function main() {
  console.log('🚀 Starting comprehensive article scraping...\n');

  let totalScraped = 0;
  let totalAdded = 0;

  for (const [outletId, urls] of Object.entries(outletsAndUrls)) {
    console.log(`\n📰 === Scraping articles for ${outletId} ===`);
    console.log(` Found ${urls.length} URLs to scrape`);

    let articleCount = 0;
    for (const url of urls.slice(0, MAX_ARTICLES_PER_OUTLET)) {
      totalScraped++;
      const articleData = await scrapeArticle(url);
      if (articleData) {
        const result = await addArticleToStorage(outletId, articleData);
        if (result) {
          totalAdded++;
          articleCount++;
        }
      }
      // Respectful delay
      await delay(REQUEST_DELAY_MS);
    }

    console.log(` ✅ Added ${articleCount} articles for ${outletId}`);
  }

  // Avoid printing "NaN%" when there was nothing to process.
  const successRate = totalScraped === 0 ? 0 : Math.round((totalAdded / totalScraped) * 100);

  console.log(`\n🎉 === Scraping Summary ===`);
  console.log(` 📊 Total URLs processed: ${totalScraped}`);
  console.log(` ✅ Articles successfully added: ${totalAdded}`);
  console.log(` 📈 Success rate: ${successRate}%`);
  console.log(`\n✨ Comprehensive scraping completed!`);
}

// Run the scraper
main().catch((error) => {
  console.error(error);
  // Signal failure to the calling shell without cutting off pending output.
  process.exitCode = 1;
});