import { readFileSync } from 'fs';
import { join } from 'path';

/** A single outlet (person, topic or company) together with its source links. */
export interface OutletLinks {
  name: string;
  category: 'people' | 'topics' | 'companies';
  focusSubject: string;
  urls: string[];
}

/** Outlets grouped by category; `total` counts every saved outlet. */
export interface ParsedOutlets {
  people: OutletLinks[];
  topics: OutletLinks[];
  companies: OutletLinks[];
  total: number;
}

type OutletCategory = OutletLinks['category'];

/**
 * Outlet header such as "### 1. Ala Shaabana (description)". The numbering is
 * optional and one trailing parenthetical is excluded from the captured name.
 * The `(?!#)` lookahead forces every leading `#` to be consumed by the prefix.
 */
const OUTLET_HEADER_PATTERN = /^#{3,}(?!#)\s*(?:\d+\.\s*)?(.+?)(?:\s*\([^)]*\))?$/;

/** Numbered URL line such as "1. https://example.com". */
const NUMBERED_URL_PATTERN = /^\d+\.\s*(https?:\/\/.+)$/;

/** Company entry such as "1. Ava Labs (Avalanche platform)"; the name must start with a Latin letter. */
const COMPANY_ENTRY_PATTERN = /^\d+\.\s*([A-Za-z].*?)(?:\s*\([^)]*\))?$/;

/** Any parenthetical, including the whitespace in front of it. */
const PARENTHETICAL_PATTERN = /\s*\([^)]*\)/g;

/**
 * Abbreviations that are expanded to their full name. A Map is used instead of
 * a plain object so that names like "constructor" cannot hit inherited members.
 */
const ABBREVIATION_EXPANSIONS: ReadonlyMap<string, string> = new Map([
  ['CBDC', 'Central Bank Digital Currency'],
  ['CFTC', 'Commodity Futures Trading Commission'],
  ['SEC', 'Securities and Exchange Commission'],
  ['DAT', 'Digital Asset Treasury'],
  ['DeFi', 'Decentralized Finance'],
  ['DEX', 'Decentralized Exchange'],
  ['NFT', 'Non-Fungible Token'],
  ['RWA', 'Real World Assets'],
  ['SWF', 'Sovereign Wealth Fund'],
]);

/** Parses markdown-like outlet listings into structured, categorized data. */
export class OutletParser {
  /**
   * Reads the file at `filePath` and parses it with {@link OutletParser.parseOutletContent}.
   *
   * @param filePath - Path of the UTF-8 text file to parse.
   * @returns The outlets found in the file, grouped by category.
   * @throws Error with a "Failed to parse outlet file: ..." message when the
   *   file cannot be read or parsed.
   */
  static parseOutletFile(filePath: string): ParsedOutlets {
    try {
      console.log(`Parsing outlet file: ${filePath}`);

      const content = readFileSync(filePath, 'utf-8');
      const parsed = OutletParser.parseOutletContent(content);

      console.log(`Successfully parsed ${parsed.total} outlets:`);
      console.log(`- People: ${parsed.people.length}`);
      console.log(`- Topics: ${parsed.topics.length}`);
      console.log(`- Companies: ${parsed.companies.length}`);

      return parsed;
    } catch (error: unknown) {
      const message = error instanceof Error ? error.message : String(error);
      console.error('Error parsing outlet file:', message);
      throw new Error(`Failed to parse outlet file: ${message}`);
    }
  }

  /**
   * Parses outlet text that is already in memory (no file access, no logging).
   *
   * Recognized lines, in order of precedence:
   * - section headers containing "## People", "## Topics" or "## Companies"
   *   (or starting with "📋 Companies") switch the current category;
   * - other `#` / `##` headers are ignored;
   * - `###` headers start a new outlet in the current category;
   * - numbered or bare http(s) URLs are added to the current outlet, and are
   *   dropped when there is no current outlet;
   * - inside the companies section, "1. Name (description)" starts a company.
   *
   * Outlets without any URL are not included in the result.
   *
   * @param content - Raw text; both LF and CRLF line endings are accepted.
   * @returns The outlets grouped by category.
   */
  static parseOutletContent(content: string): ParsedOutlets {
    const parsed: ParsedOutlets = { people: [], topics: [], companies: [], total: 0 };
    let currentCategory: OutletCategory | null = null;
    let currentOutlet: OutletLinks | null = null;

    for (const rawLine of content.split('\n')) {
      const line = rawLine.trim();
      if (line === '') continue;

      // Section headers must be detected before generic headers are skipped.
      const section = OutletParser.detectSection(line);
      if (section !== null) {
        currentCategory = section;
        continue;
      }

      if (line.startsWith('#') && !line.startsWith('###')) continue;

      if (line.startsWith('###') && currentCategory !== null) {
        OutletParser.commitOutlet(parsed, currentOutlet);
        // createOutlet yields null for an unusable header, so later URLs can
        // never be appended to the outlet that was just committed.
        currentOutlet = OutletParser.createOutlet(
          OUTLET_HEADER_PATTERN.exec(line)?.[1],
          currentCategory,
        );
        continue;
      }

      // URL lines are always consumed here so that "1. https://..." is never
      // mistaken for a company entry below.
      const url = OutletParser.extractUrl(line);
      if (url !== null) {
        if (currentOutlet !== null) currentOutlet.urls.push(url);
        continue;
      }

      if (currentCategory === 'companies') {
        const companyName = COMPANY_ENTRY_PATTERN.exec(line)?.[1];
        if (companyName !== undefined) {
          OutletParser.commitOutlet(parsed, currentOutlet);
          currentOutlet = OutletParser.createOutlet(companyName, 'companies');
        }
      }
    }

    // The last outlet has no following header to trigger its commit.
    OutletParser.commitOutlet(parsed, currentOutlet);
    return parsed;
  }

  /** Returns the category announced by a section header line, or null for any other line. */
  private static detectSection(line: string): OutletCategory | null {
    if (line.includes('## People')) return 'people';
    if (line.includes('## Topics')) return 'topics';
    if (line.includes('## Companies') || line.startsWith('📋 Companies')) return 'companies';
    return null;
  }

  /** Returns the URL carried by a numbered or bare URL line, or null when the line is not a URL line. */
  private static extractUrl(line: string): string | null {
    const numbered = NUMBERED_URL_PATTERN.exec(line)?.[1];
    if (numbered !== undefined) return numbered;
    if (line.startsWith('http://') || line.startsWith('https://')) return line;
    return null;
  }

  /** Stores `outlet` under its own category when it has at least one URL. */
  private static commitOutlet(parsed: ParsedOutlets, outlet: OutletLinks | null): void {
    if (outlet === null || outlet.urls.length === 0) return;
    parsed[outlet.category].push(outlet);
    parsed.total++;
  }

  /** Builds an empty outlet from a captured raw name; returns null when no usable name remains. */
  private static createOutlet(
    rawName: string | undefined,
    category: OutletCategory,
  ): OutletLinks | null {
    if (rawName === undefined) return null;
    const name = OutletParser.cleanOutletName(rawName.trim());
    if (name === '') return null;
    return {
      name,
      category,
      focusSubject: OutletParser.generateFocusSubject(name),
      urls: [],
    };
  }

  /**
   * Removes parenthetical descriptions such as "(OSS Capital 창립자)" and
   * expands known abbreviations (e.g. "CBDC") to their full name.
   */
  private static cleanOutletName(rawName: string): string {
    const cleaned = rawName.replace(PARENTHETICAL_PATTERN, '').trim();
    return ABBREVIATION_EXPANSIONS.get(cleaned) ?? cleaned;
  }

  /**
   * Derives a lowercase, dash-separated identifier from a name. Only
   * `a-z`, `0-9` and dashes survive, so a name without any of those
   * characters produces an empty string.
   */
  private static generateFocusSubject(rawName: string): string {
    return rawName
      .replace(PARENTHETICAL_PATTERN, '')
      .trim()
      .toLowerCase()
      .replace(/[^a-z0-9\s-]/g, '') // Remove special characters
      .replace(/\s+/g, '-') // Replace spaces with dashes
      .replace(/--+/g, '-') // Replace multiple dashes with single dash
      .replace(/^-+|-+$/g, ''); // Remove leading/trailing dashes
  }

  /**
   * Categorizes an outlet by comparing its name with built-in lists of known
   * people and companies; a match in either direction of substring
   * containment counts. Anything else, including an empty name, is a topic.
   */
  private static categorizeOutlet(name: string): 'people' | 'topics' | 'companies' {
    const cleanName = name.toLowerCase().trim();
    // Every string includes '', so an empty name would match the first list.
    if (cleanName === '') return 'topics';

    // People (individual names)
    const people = [
      'ala shaabana', 'alex karp', 'arthur hayes', 'donald trump jr', 'eric trump',
      'jacob robert steeves', 'jared kushner', 'j.d. vance', 'jensen huang',
      'jerome powell', 'joseph jacks', 'robert myers', 'yat siu',
    ];

    // Companies
    const companies = [
      'xtao', 'yuma', 'taox', 'oblong', 'ava labs', 'boston dynamics',
      'blackrock', 'chainlink', 'circle', 'cme group', 'manifold labs',
    ];

    const matchesAny = (candidates: readonly string[]): boolean =>
      candidates.some(
        (candidate) => cleanName.includes(candidate) || candidate.includes(cleanName),
      );

    if (matchesAny(people)) return 'people';
    if (matchesAny(companies)) return 'companies';

    // Everything else goes to topics
    return 'topics';
  }

  /**
   * Finds an outlet by display name (case-insensitive) or by exact focus subject.
   *
   * @returns The first matching outlet (people, then topics, then companies) or null.
   */
  static getOutletByName(parsed: ParsedOutlets, name: string): OutletLinks | null {
    const wanted = name.toLowerCase();
    const allOutlets = [...parsed.people, ...parsed.topics, ...parsed.companies];
    const found = allOutlets.find(
      (outlet) => outlet.name.toLowerCase() === wanted || outlet.focusSubject === name,
    );
    return found ?? null;
  }

  /** Returns every URL of every outlet, ordered people, topics, companies. */
  static getAllUrls(parsed: ParsedOutlets): string[] {
    const allOutlets = [...parsed.people, ...parsed.topics, ...parsed.companies];
    return allOutlets.flatMap((outlet) => outlet.urls);
  }

  /** Returns every URL of the outlets in one category. */
  static getUrlsByCategory(
    parsed: ParsedOutlets,
    category: 'people' | 'topics' | 'companies',
  ): string[] {
    return parsed[category].flatMap((outlet) => outlet.urls);
  }

  /**
   * Converts parsed data to the existing outlet format, filling description,
   * avatar and bio fields with category-specific default text. The focus
   * subject doubles as the outlet id.
   */
  static convertToOutletFormat(parsed: ParsedOutlets): Array<{
    id: string;
    name: string;
    description: string;
    category: string;
    focusSubject: string;
    avatar?: string;
    profileImage?: string;
    bio: string;
    fullBio?: string[];
    urls: string[];
  }> {
    const allOutlets = [...parsed.people, ...parsed.topics, ...parsed.companies];

    return allOutlets.map((outlet) => ({
      id: outlet.focusSubject,
      name: outlet.name,
      description: OutletParser.generateDescription(outlet),
      category: outlet.category,
      focusSubject: outlet.focusSubject,
      avatar: OutletParser.getDefaultAvatar(outlet.category),
      profileImage: OutletParser.getDefaultProfileImage(outlet.category),
      bio: OutletParser.generateBio(outlet),
      fullBio: OutletParser.generateFullBio(outlet),
      urls: outlet.urls,
    }));
  }

  /** One-line description chosen by the outlet's category. */
  private static generateDescription(outlet: OutletLinks): string {
    const descriptions: Record<OutletCategory, string> = {
      people: `Latest news and analysis about ${outlet.name}`,
      topics: `Comprehensive coverage of ${outlet.name} developments and trends`,
      companies: `${outlet.name} news, updates, and market analysis`,
    };
    return descriptions[outlet.category];
  }

  /** Default avatar path for a category; unknown categories get the topic avatar. */
  private static getDefaultAvatar(category: string): string {
    switch (category) {
      case 'people':
        return '/api/assets/default-person.jpg';
      case 'companies':
        return '/api/assets/default-company.jpg';
      default:
        return '/api/assets/default-topic.jpg';
    }
  }

  /** The default profile image is the same asset as the default avatar. */
  private static getDefaultProfileImage(category: string): string {
    return OutletParser.getDefaultAvatar(category);
  }

  /** Short biography text chosen by the outlet's category. */
  private static generateBio(outlet: OutletLinks): string {
    const bios: Record<OutletCategory, string> = {
      people: `${outlet.name} is a prominent figure in technology and business, making headlines with strategic decisions and market insights.`,
      topics: `Stay informed about the latest developments in ${outlet.name} with comprehensive coverage and expert analysis.`,
      companies: `${outlet.name} continues to shape the industry with innovative solutions and strategic partnerships.`,
    };
    return bios[outlet.category];
  }

  /** Three-paragraph biography text chosen by the outlet's category. */
  private static generateFullBio(outlet: OutletLinks): string[] {
    const fullBios: Record<OutletCategory, string[]> = {
      people: [
        `${outlet.name} is a key figure in the technology and business landscape.`,
        `Known for strategic leadership and innovative thinking in their field.`,
        `Continues to influence industry trends and developments globally.`,
      ],
      topics: [
        `${outlet.name} represents a critical area of technological advancement.`,
        `Dynamic sector with ongoing market trends, regulatory updates, and innovations.`,
        `Comprehensive resource requiring expert analysis from leading industry professionals.`,
      ],
      companies: [
        `${outlet.name} is a significant player in the technology industry.`,
        `Known for innovative products and strategic market positioning.`,
        `Continues to drive industry growth and technological advancement.`,
      ],
    };
    return fullBios[outlet.category];
  }
}

/** Utility function to parse the specific attached file under `<cwd>/attached_assets`. */
export function parseAttachedOutletFile(): ParsedOutlets {
  const filePath = join(
    process.cwd(),
    'attached_assets',
    'Pasted-Ala-Shaabana-https-www-rootdata-com-news-323625-https-ffnews-com-newsarticle-funding-xtao-tsx-v-1758557992922_1758557992922.txt',
  );
  return OutletParser.parseOutletFile(filePath);
}

export default OutletParser;