// sapiens-mobile/server/outletParser.ts

import { readFileSync } from 'fs';
import { join } from 'path';

/**
 * One outlet (a person, topic, or company) together with the links collected
 * for it while parsing the outlet list file.
 */
export interface OutletLinks {
  /** Display name after cleanup (parenthetical notes removed, known acronyms expanded). */
  name: string;
  /** Section of the source file the outlet was found in. */
  category: 'people' | 'topics' | 'companies';
  /** Lower-case, dash-separated slug derived from `name`; reused as the outlet `id` on conversion. */
  focusSubject: string;
  /** URLs listed under the outlet, in file order. */
  urls: string[];
}

/**
 * Result of parsing an outlet list file: outlets grouped by category.
 */
export interface ParsedOutlets {
  /** Outlets found under the people section. */
  people: OutletLinks[];
  /** Outlets found under the topics section. */
  topics: OutletLinks[];
  /** Outlets found under the companies section. */
  companies: OutletLinks[];
  /** Number of outlets stored across all three lists. */
  total: number;
}

/**
 * Parser for the markdown-like outlet list file.
 *
 * Expected layout (all lines are trimmed, blank lines ignored):
 *   - `## People` / `## Topics` / `## Companies` (or `📋 Companies`) switch the
 *     current category.
 *   - `### <n>. <name> (<optional note>)` starts a new outlet in that category.
 *   - Inside the companies section, `<n>. <Name> (<optional note>)` also starts
 *     a new outlet.
 *   - `<n>. https://…` or a bare `http(s)://…` line adds a URL to the current
 *     outlet.
 *
 * Outlets that end up with no URLs are dropped.
 */
export class OutletParser {
  /** `### 1. Name (note)` — captures the name without the trailing parenthetical. */
  private static readonly outletHeaderPattern = /###\s*\d+\.\s*(.+?)(?:\s*\([^)]*\))?$/;

  /** `1. https://example.com` — captures the URL. */
  private static readonly numberedUrlPattern = /^\d+\.\s*(https?:\/\/.+)$/;

  /** `1. Ava Labs (note)` — name must start with an ASCII letter; captures the name. */
  private static readonly companyEntryPattern = /^\d+\.\s*([A-Za-z].*?)(?:\s*\([^)]*\))?$/;

  /**
   * Acronyms that are expanded to their long form when they make up the whole
   * outlet name. A Map is used so that names such as "toString" or
   * "constructor" cannot accidentally resolve to Object.prototype members.
   */
  private static readonly acronymExpansions: ReadonlyMap<string, string> = new Map([
    ['CBDC', 'Central Bank Digital Currency'],
    ['CFTC', 'Commodity Futures Trading Commission'],
    ['SEC', 'Securities and Exchange Commission'],
    ['DAT', 'Digital Asset Treasury'],
    ['DeFi', 'Decentralized Finance'],
    ['DEX', 'Decentralized Exchange'],
    ['NFT', 'Non-Fungible Token'],
    ['RWA', 'Real World Assets'],
    ['SWF', 'Sovereign Wealth Fund'],
  ]);

  /**
   * Read the file at `filePath` and return its outlets grouped by category.
   *
   * @param filePath Path of the UTF-8 outlet list file.
   * @returns The parsed outlets; `total` counts every stored outlet.
   * @throws Error with a `Failed to parse outlet file: …` message when the file
   *   cannot be read or parsing fails.
   */
  static parseOutletFile(filePath: string): ParsedOutlets {
    try {
      console.log(`Parsing outlet file: ${filePath}`);

      const content = readFileSync(filePath, 'utf-8');
      const parsed = OutletParser.parseContent(content);

      console.log(`Successfully parsed ${parsed.total} outlets:`);
      console.log(`- People: ${parsed.people.length}`);
      console.log(`- Topics: ${parsed.topics.length}`);
      console.log(`- Companies: ${parsed.companies.length}`);

      return parsed;
    } catch (error: unknown) {
      const message = error instanceof Error ? error.message : String(error);
      console.error('Error parsing outlet file:', message);
      throw new Error(`Failed to parse outlet file: ${message}`);
    }
  }

  /** Walk the trimmed, non-empty lines of `content` and build the grouped result. */
  private static parseContent(content: string): ParsedOutlets {
    const lines = content
      .split(/\r?\n/)
      .map(line => line.trim())
      .filter(line => line.length > 0);

    const parsed: ParsedOutlets = { people: [], topics: [], companies: [], total: 0 };

    let currentCategory: OutletLinks['category'] | null = null;
    let currentOutlet: OutletLinks | null = null;

    for (const line of lines) {
      // Section headers are detected first, before other '#' lines are skipped.
      const section = OutletParser.detectSection(line);
      if (section) {
        currentCategory = section;
        continue;
      }

      // Any other '#' or '##' header carries no data.
      if (line.startsWith('#') && !line.startsWith('###')) continue;

      // Outlet header, e.g. "### 1. Ala Shaabana - Bittensor 공동창립자".
      if (line.startsWith('###') && currentCategory) {
        OutletParser.storeOutlet(parsed, currentOutlet);
        // Always reset: if this header cannot be parsed, following URLs must not
        // be appended to (and re-store) the outlet that was just saved.
        currentOutlet = null;

        const headerMatch = OutletParser.outletHeaderPattern.exec(line);
        if (headerMatch) {
          currentOutlet = OutletParser.createOutlet(headerMatch[1], currentCategory);
        } else {
          console.warn(`Skipping unrecognized outlet header: ${line}`);
        }
        continue;
      }

      // Numbered URL, e.g. "1. https://example.com". Consumed even when there is
      // no current outlet so it can never be mistaken for a company entry below.
      const numberedUrl = OutletParser.numberedUrlPattern.exec(line);
      if (numberedUrl) {
        if (currentOutlet) {
          currentOutlet.urls.push(numberedUrl[1]);
        }
        continue;
      }

      // Bare URL line.
      if (line.startsWith('http://') || line.startsWith('https://')) {
        if (currentOutlet) {
          currentOutlet.urls.push(line);
        }
        continue;
      }

      // Company entry, e.g. "1. Ava Labs (Avalanche 플랫폼)".
      if (currentCategory === 'companies') {
        const companyMatch = OutletParser.companyEntryPattern.exec(line);
        if (companyMatch) {
          OutletParser.storeOutlet(parsed, currentOutlet);
          currentOutlet = OutletParser.createOutlet(companyMatch[1], 'companies');
        }
      }
    }

    // The last outlet has no following header to trigger its storage.
    OutletParser.storeOutlet(parsed, currentOutlet);

    return parsed;
  }

  /** Return the category announced by a section header line, or null if it is not one. */
  private static detectSection(line: string): OutletLinks['category'] | null {
    if (line.includes('## People')) return 'people';
    if (line.includes('## Topics')) return 'topics';
    if (line.includes('## Companies') || line.startsWith('📋 Companies')) return 'companies';
    return null;
  }

  /** Build an empty outlet record from the raw name captured in a header line. */
  private static createOutlet(rawName: string, category: OutletLinks['category']): OutletLinks {
    const name = OutletParser.cleanOutletName(rawName.trim());
    return {
      name,
      category,
      focusSubject: OutletParser.generateFocusSubject(name),
      urls: [],
    };
  }

  /** Append `outlet` to its category list if it collected at least one URL. */
  private static storeOutlet(parsed: ParsedOutlets, outlet: OutletLinks | null): void {
    if (outlet && outlet.urls.length > 0) {
      parsed[outlet.category].push(outlet);
      parsed.total++;
    }
  }

  /**
   * Strip parenthetical notes such as "(연방준비제도 의장)" from a raw name and
   * expand it when the whole name is a known acronym.
   */
  private static cleanOutletName(rawName: string): string {
    const cleaned = rawName.replace(/\s*\([^)]*\)/g, '').trim();
    return OutletParser.acronymExpansions.get(cleaned) ?? cleaned;
  }

  /**
   * Derive the slug used as focus subject / id: parentheticals removed,
   * lower-cased, everything except a-z, 0-9, whitespace and dashes dropped,
   * whitespace turned into single dashes. A name with no ASCII letters or
   * digits yields an empty string.
   */
  private static generateFocusSubject(rawName: string): string {
    const subject = rawName.replace(/\s*\([^)]*\)/g, '').trim();

    return subject.toLowerCase()
      .replace(/[^a-z0-9\s-]/g, '') // Remove special characters
      .replace(/\s+/g, '-') // Replace whitespace runs with a dash
      .replace(/--+/g, '-') // Collapse repeated dashes
      .replace(/^-+|-+$/g, ''); // Remove leading/trailing dashes
  }

  /**
   * Guess a category from a name using fixed lists of known people and
   * companies (substring match in either direction); anything else is a topic.
   * Not called by the parser, which takes the category from the file section.
   */
  private static categorizeOutlet(name: string): 'people' | 'topics' | 'companies' {
    const cleanName = name.toLowerCase().trim();

    // An empty string is a substring of every entry and would match "people".
    if (!cleanName) return 'topics';

    // People (individual names)
    const people = [
      'ala shaabana', 'alex karp', 'arthur hayes', 'donald trump jr', 'eric trump',
      'jacob robert steeves', 'jared kushner', 'j.d. vance', 'jensen huang',
      'jerome powell', 'joseph jacks', 'robert myers', 'yat siu'
    ];

    // Companies
    const companies = [
      'xtao', 'yuma', 'taox', 'oblong', 'ava labs', 'boston dynamics',
      'blackrock', 'chainlink', 'circle', 'cme group', 'manifold labs'
    ];

    if (people.some(person => cleanName.includes(person) || person.includes(cleanName))) {
      return 'people';
    }

    if (companies.some(company => cleanName.includes(company) || company.includes(cleanName))) {
      return 'companies';
    }

    // Everything else goes to topics
    return 'topics';
  }

  /**
   * Find an outlet whose name equals `name` case-insensitively or whose
   * focus subject equals `name` exactly.
   *
   * @returns The first match in people, topics, companies order, or null.
   */
  static getOutletByName(parsed: ParsedOutlets, name: string): OutletLinks | null {
    const wanted = name.toLowerCase();
    const allOutlets = [...parsed.people, ...parsed.topics, ...parsed.companies];
    return allOutlets.find(outlet =>
      outlet.name.toLowerCase() === wanted ||
      outlet.focusSubject === name
    ) ?? null;
  }

  /** Collect every URL of every outlet, in people, topics, companies order. */
  static getAllUrls(parsed: ParsedOutlets): string[] {
    const allOutlets = [...parsed.people, ...parsed.topics, ...parsed.companies];
    return allOutlets.flatMap(outlet => outlet.urls);
  }

  /** Collect every URL of the outlets in one category. */
  static getUrlsByCategory(parsed: ParsedOutlets, category: 'people' | 'topics' | 'companies'): string[] {
    return parsed[category].flatMap(outlet => outlet.urls);
  }

  /**
   * Convert parsed outlets into the existing outlet record shape, filling
   * description, bio and avatar fields with per-category template values.
   * The outlet's focus subject is used as its `id`.
   */
  static convertToOutletFormat(parsed: ParsedOutlets): Array<{
    id: string;
    name: string;
    description: string;
    category: string;
    focusSubject: string;
    avatar?: string;
    profileImage?: string;
    bio: string;
    fullBio?: string[];
    urls: string[];
  }> {
    const allOutlets = [...parsed.people, ...parsed.topics, ...parsed.companies];

    // Helpers are referenced through the class name so the method keeps working
    // when it is passed around detached from the class.
    return allOutlets.map(outlet => ({
      id: outlet.focusSubject,
      name: outlet.name,
      description: OutletParser.generateDescription(outlet),
      category: outlet.category,
      focusSubject: outlet.focusSubject,
      avatar: OutletParser.getDefaultAvatar(outlet.category),
      profileImage: OutletParser.getDefaultProfileImage(outlet.category),
      bio: OutletParser.generateBio(outlet),
      fullBio: OutletParser.generateFullBio(outlet),
      urls: outlet.urls,
    }));
  }

  /** One-line description template for the outlet's category. */
  private static generateDescription(outlet: OutletLinks): string {
    const descriptions: Record<OutletLinks['category'], string> = {
      people: `Latest news and analysis about ${outlet.name}`,
      topics: `Comprehensive coverage of ${outlet.name} developments and trends`,
      companies: `${outlet.name} news, updates, and market analysis`,
    };

    return descriptions[outlet.category];
  }

  /** Default avatar path for a category; unknown categories get the topic avatar. */
  private static getDefaultAvatar(category: string): string {
    const avatars: Record<OutletLinks['category'], string> = {
      people: '/api/assets/default-person.jpg',
      topics: '/api/assets/default-topic.jpg',
      companies: '/api/assets/default-company.jpg',
    };

    // Own-property check so inherited keys (e.g. "constructor") fall back too.
    return Object.prototype.hasOwnProperty.call(avatars, category)
      ? avatars[category as keyof typeof avatars]
      : avatars.topics;
  }

  /** Default profile image; currently the same asset as the default avatar. */
  private static getDefaultProfileImage(category: string): string {
    return OutletParser.getDefaultAvatar(category);
  }

  /** Short bio template for the outlet's category. */
  private static generateBio(outlet: OutletLinks): string {
    const bios: Record<OutletLinks['category'], string> = {
      people: `${outlet.name} is a prominent figure in technology and business, making headlines with strategic decisions and market insights.`,
      topics: `Stay informed about the latest developments in ${outlet.name} with comprehensive coverage and expert analysis.`,
      companies: `${outlet.name} continues to shape the industry with innovative solutions and strategic partnerships.`,
    };

    return bios[outlet.category];
  }

  /** Three-paragraph bio template for the outlet's category. */
  private static generateFullBio(outlet: OutletLinks): string[] {
    const fullBios: Record<OutletLinks['category'], string[]> = {
      people: [
        `${outlet.name} is a key figure in the technology and business landscape.`,
        `Known for strategic leadership and innovative thinking in their field.`,
        `Continues to influence industry trends and developments globally.`
      ],
      topics: [
        `${outlet.name} represents a critical area of technological advancement.`,
        `Dynamic sector with ongoing market trends, regulatory updates, and innovations.`,
        `Comprehensive resource requiring expert analysis from leading industry professionals.`
      ],
      companies: [
        `${outlet.name} is a significant player in the technology industry.`,
        `Known for innovative products and strategic market positioning.`,
        `Continues to drive industry growth and technological advancement.`
      ]
    };

    return fullBios[outlet.category];
  }
}

/**
 * Parse the pasted outlet list that lives in `attached_assets` under the
 * current working directory.
 *
 * @returns The outlets parsed from that file, grouped by category.
 */
export function parseAttachedOutletFile(): ParsedOutlets {
  const attachedFileName =
    'Pasted-Ala-Shaabana-https-www-rootdata-com-news-323625-https-ffnews-com-newsarticle-funding-xtao-tsx-v-1758557992922_1758557992922.txt';
  const attachedFilePath = join(process.cwd(), 'attached_assets', attachedFileName);

  return OutletParser.parseOutletFile(attachedFilePath);
}

// Default export of the same class that is also exported by name above.
export default OutletParser;