// Simple article scraping script for testing
const fs = require('fs');
/**
 * Article URLs to scrape, grouped by outlet.
 *
 * Each key is the outlet ID the scraped articles are stored under; each value
 * is the list of page URLs to fetch for that outlet.
 */
const outletsAndUrls = {
  'ala-shaabana': [
    'https://www.rootdata.com/news/323625',
    'https://stealthex.io/blog/bittensor-price-prediction-can-tao-coin-reach-1000/',
    'https://www.gate.com/learn/articles/understanding-bittensor-protocol/2203',
  ],
  'jacob-robert-steeves': [
    'https://eng.ambcrypto.com/will-ai-coin-tao-reach-3000-as-its-first-halving-approaches/',
    'https://news.ssbcrack.com/bittensors-tao-coin-can-it-really-hit-3000-amid-upcoming-halving-and-ai-buzz/',
    'https://usethebitcoin.com/crypto-personalities/all-you-need-to-know-about-jacob-robert-steeves-the-co-founder-of-bittensor/',
  ],
};
/**
 * Fetch a web page and derive a minimal article record from its HTML.
 *
 * The title is taken from the page's <title> element and the summary from its
 * <meta name="description"> tag. When either is missing or blank, a
 * placeholder naming the URL's hostname is used instead.
 *
 * @param {string} url - Absolute URL of the article to fetch.
 * @returns {Promise<{title: string, summary: string, body: string, url: string} | null>}
 *   The article record, or null when the URL is invalid, the request fails,
 *   or the server answers with a non-2xx status (the error is logged).
 */
async function scrapeArticle(url) {
  try {
    const { hostname } = new URL(url);

    const response = await fetch(url);
    if (!response.ok) {
      // Do not store an error page (404, 500, ...) as if it were an article.
      throw new Error(`HTTP ${response.status}`);
    }
    const html = await response.text();

    // Extract title; whitespace runs (including newlines) collapse to one space.
    const titleMatch = html.match(/<title\b[^>]*>([^<]+)<\/title>/i);
    const rawTitle = titleMatch ? titleMatch[1].replace(/\s+/g, ' ').trim() : '';
    const title = (rawTitle || `Article from ${hostname}`).substring(0, 200);

    // Extract meta description. The value must end at the SAME quote character
    // that opened it (backreference \1), so an apostrophe inside a
    // double-quoted value no longer truncates the summary. Both attribute
    // orders (name before content, content before name) are accepted.
    const metaDescMatch =
      html.match(/<meta\b[^>]*\bname=["']description["'][^>]*\bcontent=(["'])((?:(?!\1)[\s\S])*)\1[^>]*>/i) ||
      html.match(/<meta\b[^>]*\bcontent=(["'])((?:(?!\1)[\s\S])*)\1[^>]*\bname=["']description["'][^>]*>/i);
    const rawSummary = metaDescMatch ? metaDescMatch[2].replace(/\s+/g, ' ').trim() : '';
    const summary = (rawSummary || `Article scraped from ${hostname}`).substring(0, 500);

    // Create basic body content, padded with a generic sentence when short.
    let body = `This article was originally published at ${url}.\n\n${summary}`;
    if (body.length < 200) {
      body += `\n\nThis content provides insights and analysis on current industry developments and trends.`;
    }

    return { title, summary, body, url };
  } catch (error) {
    console.error(`Error scraping ${url}:`, error.message);
    return null;
  }
}
/**
 * POST a scraped article to the article API so it is stored under an outlet.
 *
 * The payload carries the article's title, summary and body plus fixed
 * defaults: a placeholder thumbnail, the current time as `publishedAt`,
 * no tags, and a zero view count.
 *
 * @param {string} outletId - Sent as `outletId` in the request payload.
 * @param {{title: string, summary: string, body: string}} articleData - Article fields to store.
 * @param {string} [apiUrl='http://localhost:5000/api/articles'] - Endpoint to POST to.
 * @returns {Promise<object|null>} The parsed JSON response on success; null
 *   when the server answers with a non-2xx status or the request throws
 *   (both cases are logged).
 */
async function addArticleToStorage(outletId, articleData, apiUrl = 'http://localhost:5000/api/articles') {
  try {
    const payload = {
      outletId,
      title: articleData.title,
      summary: articleData.summary,
      body: articleData.body,
      thumbnail: '/api/assets/default-article.png',
      publishedAt: new Date().toISOString(),
      tags: [],
      viewCount: 0,
    };

    const response = await fetch(apiUrl, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify(payload),
    });

    if (!response.ok) {
      console.error(`❌ Failed to add article: ${response.status}`);
      return null;
    }

    const result = await response.json();
    console.log(`✅ Added article: ${articleData.title}`);
    return result;
  } catch (error) {
    console.error(`❌ Error adding article:`, error.message);
    return null;
  }
}
/**
 * Scrape every configured URL and store each successfully scraped article.
 *
 * Outlets and their URLs are processed strictly one after another, with a
 * fixed pause after each URL. URLs that fail to scrape are skipped.
 *
 * @returns {Promise<void>} Resolves once all outlets have been processed.
 */
async function main() {
  // Limit to 3 articles per outlet.
  const maxArticlesPerOutlet = 3;
  // Small delay between requests to be respectful to the scraped sites.
  const requestDelayMs = 1000;

  console.log('Starting article scraping...\n');

  for (const [outletId, urls] of Object.entries(outletsAndUrls)) {
    console.log(`\n=== Scraping articles for ${outletId} ===`);

    for (const url of urls.slice(0, maxArticlesPerOutlet)) {
      console.log(`Scraping: ${url}`);

      const articleData = await scrapeArticle(url);
      if (articleData) {
        await addArticleToStorage(outletId, articleData);
      }

      await new Promise((resolve) => setTimeout(resolve, requestDelayMs));
    }
  }

  console.log('\n✅ Scraping completed!');
}
// Run the script. An unexpected failure is logged and reported through a
// non-zero exit status so shells and CI do not mistake it for success.
main().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});