Files
sapiens-mobile/server/outletParser.ts
jungwoo choi 919afe56f2 feat: SAPIENS Mobile App - Initial commit
React Native mobile application for SAPIENS news platform.
Consolidated all previous history into single commit.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-10-23 14:30:25 +09:00

315 lines
11 KiB
TypeScript

import { readFileSync } from 'fs';
import { join } from 'path';
/**
 * One outlet (a person, topic, or company) together with the links collected for it.
 */
export interface OutletLinks {
/** Outlet name as stored by the parser (parenthetical notes stripped; a few acronyms expanded). */
name: string;
/** Section of the source file the outlet was found in. */
category: 'people' | 'topics' | 'companies';
/** Lowercase, dash-separated slug derived from `name`; `convertToOutletFormat` also uses it as the `id`. */
focusSubject: string;
/** URLs attached to this outlet, in the order they were encountered. */
urls: string[];
}
/**
 * Result of parsing an outlet file: outlets grouped by category plus a running count.
 */
export interface ParsedOutlets {
/** Outlets found under the people section. */
people: OutletLinks[];
/** Outlets found under the topics section. */
topics: OutletLinks[];
/** Outlets found under the companies section. */
companies: OutletLinks[];
/** Number of outlets the parser saved across all three lists. */
total: number;
}
/**
 * Static helpers that read a markdown-like outlet list from disk and expose
 * the result as structured data.
 *
 * Expected input shape (one item per line, blank lines ignored):
 * - section markers containing `## People`, `## Topics`, `## Companies`
 *   (or a line starting with `📋 Companies`);
 * - outlet headers such as `### 1. Some Name (note)`;
 * - in the companies section, plain numbered entries such as `1. Ava Labs (note)`;
 * - links, either numbered (`1. https://…`) or bare (`https://…`).
 */
export class OutletParser {
  /** Names that are replaced by a long form when they make up the whole outlet name. */
  private static readonly NAME_EXPANSIONS: ReadonlyMap<string, string> = new Map([
    ['CBDC', 'Central Bank Digital Currency'],
    ['CFTC', 'Commodity Futures Trading Commission'],
    ['SEC', 'Securities and Exchange Commission'],
    ['DAT', 'Digital Asset Treasury'],
    ['DeFi', 'Decentralized Finance'],
    ['DEX', 'Decentralized Exchange'],
    ['NFT', 'Non-Fungible Token'],
    ['RWA', 'Real World Assets'],
    ['SWF', 'Sovereign Wealth Fund'],
  ]);

  /** `### <n>. <name> (optional note)` — captures the name without a trailing note. */
  private static readonly OUTLET_HEADER = /###\s*\d+\.\s*(.+?)(?:\s*\([^)]*\))?$/;
  /** `<n>. <url>` — captures the URL. */
  private static readonly NUMBERED_URL = /^\d+\.\s*(https?:\/\/.+)$/;
  /** `<n>. <text starting with an ASCII letter>` — cheap pre-check for company entries. */
  private static readonly COMPANY_ENTRY_START = /^\d+\.\s*[A-Za-z]/;
  /** `<n>. <name> (optional note)` — captures the company name without a trailing note. */
  private static readonly COMPANY_ENTRY = /^\d+\.\s*(.+?)(?:\s*\([^)]*\))?$/;

  /**
   * Reads `filePath` as UTF-8 and groups the outlets it lists by category.
   *
   * Outlets that end up with no URLs are not included in the result.
   *
   * @param filePath - Path of the outlet list to read.
   * @returns The outlets per category and the total number saved.
   * @throws Error with the message `Failed to parse outlet file: <reason>` when
   *   reading or parsing fails.
   */
  static parseOutletFile(filePath: string): ParsedOutlets {
    try {
      console.log(`Parsing outlet file: ${filePath}`);
      // Trimming also removes the '\r' of CRLF files; blank lines are dropped here.
      const lines = readFileSync(filePath, 'utf-8')
        .split('\n')
        .map(raw => raw.trim())
        .filter(text => text.length > 0);

      const parsed: ParsedOutlets = { people: [], topics: [], companies: [], total: 0 };
      let currentCategory: 'people' | 'topics' | 'companies' | null = null;
      let currentOutlet: OutletLinks | null = null;

      for (const line of lines) {
        // Section markers are checked before the generic '#' skip below.
        const section = this.detectSection(line);
        if (section !== null) {
          currentCategory = section;
          continue;
        }

        // Any other '#' / '##' heading carries no data.
        if (line.startsWith('#') && !line.startsWith('###')) continue;

        // Outlet header, e.g. "### 1. Ala Shaabana - Bittensor 공동창립자".
        if (line.startsWith('###') && currentCategory) {
          this.commitOutlet(parsed, currentOutlet);
          const rawName = line.match(this.OUTLET_HEADER)?.[1];
          if (rawName === undefined) {
            // The previous outlet has already been committed. Keeping a reference
            // to it would attach the following links to the wrong outlet and
            // commit it a second time, so start from a clean slate instead.
            console.warn(`Skipping unrecognized outlet header: ${line}`);
            currentOutlet = null;
          } else {
            currentOutlet = this.createOutlet(rawName, currentCategory);
          }
          continue;
        }

        // Numbered link, e.g. "1. https://example.com". A numbered link with no
        // open outlet is dropped; it must never reach the company-entry branch,
        // where it would be mistaken for a company name.
        const numberedUrl = line.match(this.NUMBERED_URL)?.[1];
        if (numberedUrl !== undefined) {
          if (currentOutlet) currentOutlet.urls.push(numberedUrl);
          continue;
        }

        // Bare link on its own line.
        if (line.startsWith('http://') || line.startsWith('https://')) {
          if (currentOutlet) currentOutlet.urls.push(line);
          continue;
        }

        // Company entry without a '###' prefix, e.g. "1. Ava Labs (Avalanche 플랫폼)".
        if (currentCategory === 'companies' && this.COMPANY_ENTRY_START.test(line)) {
          this.commitOutlet(parsed, currentOutlet);
          const rawName = line.match(this.COMPANY_ENTRY)?.[1];
          currentOutlet = rawName === undefined ? null : this.createOutlet(rawName, 'companies');
        }
      }

      // The last outlet has no following header to trigger its commit.
      this.commitOutlet(parsed, currentOutlet);

      console.log(`Successfully parsed ${parsed.total} outlets:`);
      console.log(`- People: ${parsed.people.length}`);
      console.log(`- Topics: ${parsed.topics.length}`);
      console.log(`- Companies: ${parsed.companies.length}`);
      return parsed;
    } catch (error: unknown) {
      const reason = error instanceof Error ? error.message : String(error);
      console.error('Error parsing outlet file:', reason);
      throw new Error(`Failed to parse outlet file: ${reason}`);
    }
  }

  /** Returns the category a section-marker line switches to, or `null` for any other line. */
  private static detectSection(line: string): 'people' | 'topics' | 'companies' | null {
    if (line.includes('## People')) return 'people';
    if (line.includes('## Topics')) return 'topics';
    if (line.includes('## Companies') || line.startsWith('📋 Companies')) return 'companies';
    return null;
  }

  /** Adds `outlet` to its category list and bumps `total`, unless it is missing or has no URLs. */
  private static commitOutlet(parsed: ParsedOutlets, outlet: OutletLinks | null): void {
    if (outlet === null || outlet.urls.length === 0) return;
    parsed[outlet.category].push(outlet);
    parsed.total++;
  }

  /** Builds an outlet with no URLs yet from the raw name captured out of a header line. */
  private static createOutlet(
    rawName: string,
    category: 'people' | 'topics' | 'companies',
  ): OutletLinks {
    const name = this.cleanOutletName(rawName.trim());
    return {
      name,
      category,
      focusSubject: this.generateFocusSubject(name),
      urls: [],
    };
  }

  /**
   * Normalizes an outlet name: strips every parenthetical such as
   * "(연방준비제도 의장)" and expands the name when it is exactly one of the known
   * short forms (e.g. `SEC`).
   */
  private static cleanOutletName(rawName: string): string {
    const cleaned = rawName.replace(/\s*\([^)]*\)/g, '').trim();
    // A Map lookup cannot hit inherited members, unlike indexing a plain object
    // with names such as "constructor".
    return this.NAME_EXPANSIONS.get(cleaned) ?? cleaned;
  }

  /**
   * Derives a lowercase, dash-separated slug from a name.
   *
   * Only `a-z`, `0-9`, whitespace and `-` survive, so a name without any ASCII
   * letters or digits yields an empty string.
   */
  private static generateFocusSubject(rawName: string): string {
    const subject = rawName.replace(/\s*\([^)]*\)/g, '').trim();
    return subject
      .toLowerCase()
      .replace(/[^a-z0-9\s-]/g, '') // drop everything but ASCII alphanumerics, whitespace, dashes
      .replace(/\s+/g, '-') // whitespace runs become a single dash
      .replace(/--+/g, '-') // collapse repeated dashes
      .replace(/^-+|-+$/g, ''); // trim dashes at both ends
  }

  /**
   * Guesses a category from a name using fixed lists of people and companies.
   *
   * Matching is case-insensitive and by substring in either direction; names
   * that match neither list (and blank names) are treated as topics.
   * No other method in this class calls it.
   */
  private static categorizeOutlet(name: string): 'people' | 'topics' | 'companies' {
    const cleanName = name.toLowerCase().trim();
    // Every string contains '', so a blank name would otherwise match the first list.
    if (cleanName.length === 0) return 'topics';

    const people = [
      'ala shaabana', 'alex karp', 'arthur hayes', 'donald trump jr', 'eric trump',
      'jacob robert steeves', 'jared kushner', 'j.d. vance', 'jensen huang',
      'jerome powell', 'joseph jacks', 'robert myers', 'yat siu',
    ];
    const companies = [
      'xtao', 'yuma', 'taox', 'oblong', 'ava labs', 'boston dynamics',
      'blackrock', 'chainlink', 'circle', 'cme group', 'manifold labs',
    ];
    const overlaps = (candidate: string): boolean =>
      cleanName.includes(candidate) || candidate.includes(cleanName);

    if (people.some(overlaps)) return 'people';
    if (companies.some(overlaps)) return 'companies';
    return 'topics';
  }

  /**
   * Finds the first outlet whose name equals `name` case-insensitively or whose
   * `focusSubject` equals `name` exactly. People are searched first, then topics,
   * then companies.
   *
   * @returns The matching outlet, or `null` when there is none.
   */
  static getOutletByName(parsed: ParsedOutlets, name: string): OutletLinks | null {
    const wanted = name.toLowerCase();
    const allOutlets = [...parsed.people, ...parsed.topics, ...parsed.companies];
    return (
      allOutlets.find(
        outlet => outlet.name.toLowerCase() === wanted || outlet.focusSubject === name,
      ) ?? null
    );
  }

  /** Returns every URL of every outlet (people, then topics, then companies). */
  static getAllUrls(parsed: ParsedOutlets): string[] {
    const allOutlets = [...parsed.people, ...parsed.topics, ...parsed.companies];
    return allOutlets.flatMap(outlet => outlet.urls);
  }

  /** Returns every URL of the outlets in one category. */
  static getUrlsByCategory(parsed: ParsedOutlets, category: 'people' | 'topics' | 'companies'): string[] {
    return parsed[category].flatMap(outlet => outlet.urls);
  }

  /**
   * Maps the parsed outlets to flat records with generated descriptive text and
   * default images. `id` is the outlet's `focusSubject`; `urls` is the outlet's
   * own array, not a copy.
   */
  static convertToOutletFormat(parsed: ParsedOutlets): Array<{
    id: string;
    name: string;
    description: string;
    category: string;
    focusSubject: string;
    avatar?: string;
    profileImage?: string;
    bio: string;
    fullBio?: string[];
    urls: string[];
  }> {
    const allOutlets = [...parsed.people, ...parsed.topics, ...parsed.companies];
    return allOutlets.map(outlet => ({
      id: outlet.focusSubject,
      name: outlet.name,
      description: this.generateDescription(outlet),
      category: outlet.category,
      focusSubject: outlet.focusSubject,
      avatar: this.getDefaultAvatar(outlet.category),
      profileImage: this.getDefaultProfileImage(outlet.category),
      bio: this.generateBio(outlet),
      fullBio: this.generateFullBio(outlet),
      urls: outlet.urls,
    }));
  }

  /** One-line description template chosen by the outlet's category. */
  private static generateDescription(outlet: OutletLinks): string {
    const descriptions = {
      people: `Latest news and analysis about ${outlet.name}`,
      topics: `Comprehensive coverage of ${outlet.name} developments and trends`,
      companies: `${outlet.name} news, updates, and market analysis`,
    };
    return descriptions[outlet.category];
  }

  /** Default image path for a category; unknown categories get the topic image. */
  private static getDefaultAvatar(category: string): string {
    const avatars = {
      people: '/api/assets/default-person.jpg',
      topics: '/api/assets/default-topic.jpg',
      companies: '/api/assets/default-company.jpg',
    };
    // Own-property check so strings like "constructor" fall back to the default
    // instead of returning an inherited member.
    return Object.prototype.hasOwnProperty.call(avatars, category)
      ? avatars[category as keyof typeof avatars]
      : avatars.topics;
  }

  /** Profile images currently reuse the default avatar for the category. */
  private static getDefaultProfileImage(category: string): string {
    return this.getDefaultAvatar(category);
  }

  /** Short bio template chosen by the outlet's category. */
  private static generateBio(outlet: OutletLinks): string {
    const bios = {
      people: `${outlet.name} is a prominent figure in technology and business, making headlines with strategic decisions and market insights.`,
      topics: `Stay informed about the latest developments in ${outlet.name} with comprehensive coverage and expert analysis.`,
      companies: `${outlet.name} continues to shape the industry with innovative solutions and strategic partnerships.`,
    };
    return bios[outlet.category];
  }

  /** Three-sentence bio template chosen by the outlet's category. */
  private static generateFullBio(outlet: OutletLinks): string[] {
    const fullBios = {
      people: [
        `${outlet.name} is a key figure in the technology and business landscape.`,
        `Known for strategic leadership and innovative thinking in their field.`,
        `Continues to influence industry trends and developments globally.`,
      ],
      topics: [
        `${outlet.name} represents a critical area of technological advancement.`,
        `Dynamic sector with ongoing market trends, regulatory updates, and innovations.`,
        `Comprehensive resource requiring expert analysis from leading industry professionals.`,
      ],
      companies: [
        `${outlet.name} is a significant player in the technology industry.`,
        `Known for innovative products and strategic market positioning.`,
        `Continues to drive industry growth and technological advancement.`,
      ],
    };
    return fullBios[outlet.category];
  }
}
// Utility function to parse the specific file
/**
 * Parses an outlet list with `OutletParser.parseOutletFile`.
 *
 * @param filePath - File to parse. When omitted, the pasted outlet list under
 *   `attached_assets` in the current working directory is used, which is what
 *   this function always read before the parameter existed.
 * @returns The parsed outlets.
 * @throws Error when the file cannot be read or parsed (propagated from
 *   `OutletParser.parseOutletFile`).
 */
export function parseAttachedOutletFile(
  filePath: string = join(
    process.cwd(),
    'attached_assets',
    'Pasted-Ala-Shaabana-https-www-rootdata-com-news-323625-https-ffnews-com-newsarticle-funding-xtao-tsx-v-1758557992922_1758557992922.txt',
  ),
): ParsedOutlets {
  return OutletParser.parseOutletFile(filePath);
}
// Default export of the same class that is also exported by name above.
export default OutletParser;