// Comprehensive article scraping for all outlets from attached links const fs = require('fs'); const path = require('path'); // Mapping from names to outlet IDs in our system const nameToOutletIdMap = { // People 'Ala Shaabana': 'ala-shaabana', 'Alex Karp': 'alex-karp', 'Arthur Hayes': 'arthur-hayes', 'Donald Trump Jr.': 'donald-trump-jr', 'Eric Trump': 'eric-trump', 'Jacob Robert Steeves': 'jacob-robert-steeves', 'Jared Kushner': 'jared-kushner', 'J.D. Vance': 'jd-vance', 'Jensen Huang': 'jensen-huang', 'Jerome Powell': 'jerome-powell', 'Joseph Jacks': 'joseph-jacks', 'Larry Ellison': 'larry-ellison', 'Lily Liu': 'lily-liu', 'Marco Rubio': 'marco-rubio', 'Robert Myers': 'robert-myers', 'Sam Altman': 'sam-altman', 'Satya Nadella': 'satya-nadella', 'Scott Bessent': 'scott-bessent', 'Simon Kim': 'simon-kim', 'Yat Siu': 'yat-siu', // Topics 'AI': 'ai', 'Altcoin': 'alt-coin', 'Bollywood': 'bollywood', 'CantoPop': 'cantopop', 'CBDC': 'cbdc', 'CFTC': 'cftc', 'Crypto': 'crypto', 'Custody Regulation': 'custody-regulation', 'DAT': 'dat', 'Decentralized AI': 'decentralized-ai', 'DeFi': 'defi', 'DEX': 'dex', 'Fed': 'fed', 'FOMC': 'fomc', 'J-Star': 'j-star', 'K-Star': 'k-star', 'NFT': 'nft', 'RWA': 'rwa', 'SEC': 'sec', 'Stablecoin': 'stable-coin', 'SWF': 'swf', // Companies 'Ava Labs': 'ava-labs', 'Bittensor': 'bittensor', 'BlackRock': 'blackrock', 'Boston Dynamics': 'boston-dynamics', 'Chainlink': 'chainlink', 'Circle': 'circle', 'CME Group': 'cme-group', 'Epic Games': 'epic-games', 'Hashed': 'hashed', 'Hyperliquid': 'hyperliquid', 'Oblong': 'oblong', 'OpenSea': 'opensea', 'Palantir': 'palantir', 'PancakeSwap': 'pancakeswap', 'Polygon': 'polygon', 'Saudi Aramco': 'saudi-aramco', 'Solana Foundation': 'solana-foundation', 'TAOX': 'taox', 'TRON': 'tron', 'TSMC': 'tsmc', 'Uniswap': 'uniswap', 'World Liberty Financial': 'world-liberty-financial', 'xTAO': 'xtao', 'YUMA': 'yuma' }; // Article links organized by outlet const outletsAndUrls = { // People - Ala Shaabana 
'ala-shaabana': [ 'https://www.rootdata.com/news/323625', 'https://ffnews.com/newsarticle/funding/xtao-tsx-venture-listing/', 'https://stealthex.io/blog/bittensor-price-prediction-can-tao-coin-reach-1000/', 'https://www.gate.com/learn/articles/understanding-bittensor-protocol/2203', 'https://www.investing.com/news/cryptocurrency-news/b-dash-ventures-and-hashed-announce-sponsors-and-main-speakers-for-blockchain-leaders-summit-tokyo-2025-4198036' ], // Alex Karp 'alex-karp': [ 'https://www.thestreet.com/technology/salesforce-ceo-praises-palantir-as-it-closes-950m-uk-defense-deal', 'https://www.msn.com/en-us/money/topstocks/palantir-ceo-alexander-karp-s-new-plan-to-sell-1-2-billion-of-stock/ar-AA1zn2AU', 'https://www.aol.com/palantir-stock-investors-just-got-093000800.html', 'https://www.benzinga.com/markets/equities/25/08/47296576/palantir-ceo-alex-karp-dumps-63-million-in-stock-as-pltr-surges-111-this-year', 'https://finance.yahoo.com/news/palantir-ceo-alex-karp-just-090200616.html' ], // Arthur Hayes 'arthur-hayes': [ 'https://www.ccn.com/news/crypto/arthur-hayes-hype-pivotal-momen-ultra-bullish/', 'https://cointelegraph.com/news/arthur-hayes-sold-all-hype-ferrari-testarossa', 'https://cryptonews.com/news/bitmex-co-founder-arthur-hayes-dumps-entire-hype-bag-for-a-ferrari/', 'https://www.cryptopolitan.com/arthur-hayes-sells-5-1m-in-hype/', 'https://coincentral.com/hayes-dumps-5-1m-hype-position-shortly-after-making-126x-price-call/' ], // Jacob Robert Steeves 'jacob-robert-steeves': [ 'https://eng.ambcrypto.com/will-ai-coin-tao-reach-3000-as-its-first-halving-approaches/', 'https://news.ssbcrack.com/bittensors-tao-coin-can-it-really-hit-3000-amid-upcoming-halving-and-ai-buzz/', 'https://www.globenewswire.com/news-release/2025/04/29/3070182/0/en/Alpha-Sigma-Capital-Research-Publishes-New-Report-on-Bittensor-TAO-Decentralized-Neural-Internet-Model.html', 'https://www.chainup.com/market-update/bittensor-the-ai-alpha/', 
'https://usethebitcoin.com/crypto-personalities/all-you-need-to-know-about-jacob-robert-steeves-the-co-founder-of-bittensor/' ], // Joseph Jacks 'joseph-jacks': [ 'https://fox4kc.com/business/press-releases/ein-presswire/842930120/cossa-launches-as-the-definitive-organization-for-the-26b-commercial-open-source-market', 'https://www.prnewswire.com/news-releases/tao-synergies-welcomes-top-bittensor-tao-leader-as-advisor-for-ai-focused-crypto-treasury-strategy-302538426.html', 'https://www.ainvest.com/news/joseph-jacks-joins-tao-synergies-advisor-ai-focused-crypto-treasury-strategy-2508/', 'https://fintech.global/2025/08/01/comp-ai-secures-2-6m-to-transform-soc-2-compliance/', 'https://www.stocktitan.net/news/TAOX/tao-synergies-welcomes-top-bittensor-tao-leader-as-advisor-for-ai-641ecubmt9fz.html' ], // Robert Myers 'robert-myers': [ 'https://oss.capital/news/', 'https://www.proofoftalk.io/speakers/robert-myers', 'https://coinfomania.com/singularitynet-ceo-ben-goertzel-teams-up-with-fetch-ais-humayun-sheikh-to-explore-decentralized-artificial-intelligence-at-proof-of-talk-2025/', 'https://www.theblock.co/post/353065/bittensor-tao-token-crypto-investors' ], // Topics - AI 'ai': [ 'https://www.crescendo.ai/news/latest-ai-news-and-updates', 'https://www.reuters.com/technology/artificial-intelligence/', 'https://www.marketingprofs.com/opinions/2025/53723/ai-update-september-19-2025-ai-news-and-views-from-the-past-week', 'https://www.wndu.com/2025/09/19/artificial-intelligence-ai-update-google-microsoft-apple-meta-quantum-technology-finance/', 'https://solutionsreview.com/artificial-intelligence-news-for-the-week-of-september-19-updates-from-druid-ai-dxc-technology-g-p-more/' ], // Alt Coin 'alt-coin': [ 'https://cryptopotato.com/bitcoin-joins-the-altcoin-bloodbath-with-a-sudden-flash-crash-to-112k/', 'https://www.ainvest.com/news/michael-saylor-strategic-move-september-crypto-presale-outperform-bitcoin-2509/', 
'https://coinpaper.com/11193/5-signs-the-2025-altseason-could-be-bigger-than-ever', 'https://cryptodnes.bg/en/analyst-says-2025-altcoin-rally-could-mirror-past-surges-here-is-why/', 'https://tangem.com/en/blog/post/what-is-altseason/' ], // Stable Coin 'stable-coin': [ 'https://www.coindesk.com/markets/2025/09/19/u-s-stablecoin-battle-could-be-zero-sum-game-jpmorgan', 'https://www.circle.com/usdc', 'https://www.theblock.co/post/370543/tethers-hedge-and-expand-us-strategy-puts-circle-on-defense-in-market-shake-up-tests-oversight-versus-privacy', 'https://breet.io/blog/usdt-vs-usdc', 'https://www.dlnews.com/articles/markets/tether-faces-uphill-battle-launching-usat-stablecoin-in-us/' ], // Bittensor 'bittensor': [ 'https://eng.ambcrypto.com/will-ai-coin-tao-reach-3000-as-its-first-halving-approaches/', 'https://news.ssbcrack.com/bittensors-tao-coin-can-it-really-hit-3000-amid-upcoming-halving-and-ai-buzz/', 'https://www.chainup.com/market-update/bittensor-the-ai-alpha/' ] }; async function scrapeArticle(url) { try { console.log(` Scraping: ${url}`); const response = await fetch(url, { headers: { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' } }); if (!response.ok) { throw new Error(`HTTP ${response.status}`); } const html = await response.text(); // Extract title with better regex const titleMatches = [ html.match(/
]*>([^<]+(?:<[^>]*>[^<]*<\/[^>]*>[^<]*)*)<\/p>/gi)
];
let bodyParagraphs = [];
for (const match of contentMatches) {
if (match[1]) {
const cleanText = match[1]
.replace(/<[^>]*>/g, ' ')
.replace(/\s+/g, ' ')
.trim();
if (cleanText.length > 30 && !cleanText.includes('cookie') && !cleanText.includes('subscribe')) {
bodyParagraphs.push(cleanText);
}
}
}
// Create body content
let body = `This article was originally published at ${url}.\n\n${summary}`;
if (bodyParagraphs.length > 0) {
const selectedParagraphs = bodyParagraphs.slice(0, 3);
body += `\n\n` + selectedParagraphs.join('\n\n');
} else {
body += `\n\nThis content provides insights and analysis on current industry developments and trends in the cryptocurrency, technology, and financial sectors.`;
}
// Extract main image
let thumbnail = '/api/assets/default-article.png';
const imageMatches = [
html.match(/]*property=["\']og:image["\'][^>]*content=["\']([^"']+)["\'][^>]*>/i),
html.match(/]*name=["\']twitter:image["\'][^>]*content=["\']([^"']+)["\'][^>]*>/i),
html.match(/]*src=["\']([^"']+)["\'][^>]*>/i)
];
for (const match of imageMatches) {
if (match && match[1] && match[1].startsWith('http')) {
thumbnail = match[1];
break;
}
}
return {
title,
summary,
body,
url,
thumbnail
};
} catch (error) {
console.error(` ā Error scraping ${url}:`, error.message);
return null;
}
}
/**
 * POST a scraped article to the local articles API.
 *
 * Never throws: a non-2xx response or any thrown error is logged and
 * reported as null.
 *
 * @param {string} outletId - Outlet the article is filed under.
 * @param {{title: string, summary: string, body: string, thumbnail: string}} articleData
 * @returns {Promise<object|null>} Parsed JSON response on success, otherwise null.
 */
async function addArticleToStorage(outletId, articleData) {
  try {
    // Built inside the try so a malformed articleData is reported like any other failure.
    const payload = {
      outletId,
      title: articleData.title,
      summary: articleData.summary,
      body: articleData.body,
      thumbnail: articleData.thumbnail,
      publishedAt: new Date().toISOString(),
      tags: [],
      viewCount: 0
    };

    const response = await fetch('http://localhost:5000/api/articles', {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify(payload)
    });

    if (!response.ok) {
      const errorText = await response.text();
      console.error(` ā Failed to add article (${response.status}): ${errorText}`);
      return null;
    }

    const created = await response.json();
    console.log(` ā Added: ${articleData.title.substring(0, 80)}...`);
    return created;
  } catch (error) {
    console.error(` ā Error adding article:`, error.message);
    return null;
  }
}
/**
 * Scrape every configured outlet and store the resulting articles.
 *
 * URLs are processed one at a time with a pause after each request, at most
 * MAX_ARTICLES_PER_OUTLET per outlet, and a summary is logged at the end.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const MAX_ARTICLES_PER_OUTLET = 5;
  const REQUEST_DELAY_MS = 1500;

  console.log('š Starting comprehensive article scraping...\n');
  let totalScraped = 0;
  let totalAdded = 0;

  for (const [outletId, urls] of Object.entries(outletsAndUrls)) {
    console.log(`\nš° === Scraping articles for ${outletId} ===`);
    console.log(` Found ${urls.length} URLs to scrape`);
    let articleCount = 0;

    for (const url of urls.slice(0, MAX_ARTICLES_PER_OUTLET)) {
      totalScraped++;
      const articleData = await scrapeArticle(url);
      if (articleData) {
        const result = await addArticleToStorage(outletId, articleData);
        if (result) {
          totalAdded++;
          articleCount++;
        }
      }
      // Respectful delay between requests
      await new Promise((resolve) => setTimeout(resolve, REQUEST_DELAY_MS));
    }

    console.log(` ā
Added ${articleCount} articles for ${outletId}`);
  }

  // With no URLs configured the ratio would be 0/0; report 0% instead of NaN%.
  const successRate = totalScraped === 0 ? 0 : Math.round((totalAdded / totalScraped) * 100);

  console.log(`\nš === Scraping Summary ===`);
  console.log(` š Total URLs processed: ${totalScraped}`);
  console.log(` ā
Articles successfully added: ${totalAdded}`);
  console.log(` š Success rate: ${successRate}%`);
  console.log(`\n⨠Comprehensive scraping completed!`);
}
// Run the scraper. A rejection here is an unexpected failure: log it and
// mark the process as failed so shells, cron jobs and CI see a non-zero exit.
main().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});