// zeropost-engine/src/services/fromUrl.js

/**
 * fromUrl.js — парсинг URL и генерация поста по содержимому страницы.
 *
 * Поддерживаемые источники:
 *   1. Любая веб-страница — cheerio, og-meta + основной текст
 *   2. YouTube — title + description (без транскрипта, yt-dlp не нужен)
 *   3. t.me публичный пост — текст сообщения
 */

const axios  = require('axios');
const cheerio = require('cheerio');
const ai     = require('./ai');
const pb     = require('./promptBuilder');

// Value passed as the axios `timeout` option for every page fetch in this module (milliseconds).
const FETCH_TIMEOUT = 12_000;
const MAX_TEXT_LEN  = 4000; // upper bound (string length) on the extracted text that goes into the prompt

// ── Парсеры ───────────────────────────────────────────────────────────────────

/**
 * YouTube page parser: collects the title and description from meta tags and,
 * when present, chapter titles from the JSON embedded in the page markup.
 *
 * @param {string} url - YouTube page URL to download.
 * @returns {Promise<{title: string, text: string, imageUrl: (string|null), source: string}>}
 *   `text` is title, description and an optional chapters line joined by blank
 *   lines and cut to MAX_TEXT_LEN; `source` is always 'youtube'.
 */
async function parseYoutube(url) {
  const { data: html } = await axios.get(url, {
    timeout: FETCH_TIMEOUT,
    headers: { 'User-Agent': 'Mozilla/5.0 (compatible; Googlebot/2.1)' },
    maxRedirects: 5,
  });
  const $ = cheerio.load(html);

  // Shorthand: `content` attribute of the first element matching the selector.
  const metaContent = (selector) => $(selector).attr('content');

  const title = metaContent('meta[name="title"]')
    || metaContent('meta[property="og:title"]')
    || $('title').text();

  const description = metaContent('meta[name="description"]')
    || metaContent('meta[property="og:description"]')
    || '';

  const imageUrl = metaContent('meta[property="og:image"]') || null;

  // Best-effort chapter extraction from the inline "chapters" JSON array.
  let chapters = '';
  const chaptersMatch = html.match(/"chapters":\s*\[([^\]]{1,3000})\]/);
  if (chaptersMatch) {
    try {
      const entries = JSON.parse(`[${chaptersMatch[1]}]`);
      chapters = entries
        .map((entry) => entry.title?.simpleText || '')
        .filter(Boolean)
        .join(', ');
    } catch {
      // The captured fragment is not valid JSON (or has an unexpected shape):
      // chapters are optional, so carry on without them.
    }
  }

  const sections = [title, description];
  if (chapters) {
    sections.push(`Главы: ${chapters}`);
  }
  const text = sections.filter(Boolean).join('\n\n').slice(0, MAX_TEXT_LEN);

  return { title, text, imageUrl, source: 'youtube' };
}

/**
 * Parses a public t.me post by requesting the post URL with the
 * `embed=1&mode=tme` query parameters and reading the message text from the
 * returned markup.
 *
 * The embed parameters are merged into the URL through `URL.searchParams`, so
 * an input that already carries a query string (e.g. `?comment=5`) or a
 * fragment still yields a well-formed request URL.
 *
 * @param {string} url - Absolute URL of the post, e.g. https://t.me/channel/123.
 * @returns {Promise<{title: string, text: string, imageUrl: (string|null), source: string}>}
 *   `text` is cut to MAX_TEXT_LEN; `source` is always 'telegram'.
 * @throws {TypeError} If `url` is not a valid absolute URL.
 */
async function parseTelegram(url) {
  const embedUrl = new URL(url);
  embedUrl.searchParams.set('embed', '1');
  embedUrl.searchParams.set('mode', 'tme');
  // A fragment is never sent to the server; drop it to keep the request URL clean.
  embedUrl.hash = '';

  const res = await axios.get(embedUrl.toString(), {
    timeout: FETCH_TIMEOUT,
    headers: { 'User-Agent': 'Mozilla/5.0' },
  });
  const $ = cheerio.load(res.data);

  // Prefer the rendered message body; fall back to the og:description meta tag.
  const text = $('.tgme_widget_message_text').text().trim()
    || $('meta[property="og:description"]').attr('content')
    || '';

  const title = $('meta[property="og:title"]').attr('content') || 'Telegram пост';
  const imageUrl = $('meta[property="og:image"]').attr('content') || null;

  return { title, text: text.slice(0, MAX_TEXT_LEN), imageUrl, source: 'telegram' };
}

/**
 * Generic web page parser: title and description from meta tags plus the main
 * body text gathered from paragraph-like elements.
 *
 * @param {string} url - Page URL to download.
 * @returns {Promise<{title: string, text: string, imageUrl: (string|null), source: string}>}
 *   `text` is the meta description followed by the body text, cut to
 *   MAX_TEXT_LEN; `source` is always 'web'.
 */
async function parseWeb(url) {
  const res = await axios.get(url, {
    timeout: FETCH_TIMEOUT,
    headers: {
      'User-Agent': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
      'Accept-Language': 'ru-RU,ru;q=0.9,en;q=0.8',
    },
    maxRedirects: 5,
  });

  const $ = cheerio.load(res.data);

  // Drop scripts, styles and page chrome so they do not leak into the text.
  $('script, style, nav, footer, header, aside, form, .cookie, .banner, .popup, .ad').remove();

  const title = $('meta[property="og:title"]').attr('content')
    || $('meta[name="title"]').attr('content')
    || $('h1').first().text().trim()
    || $('title').text().trim();

  const description = $('meta[property="og:description"]').attr('content')
    || $('meta[name="description"]').attr('content')
    || '';

  const imageUrl = $('meta[property="og:image"]').attr('content') || null;

  // Body text: look inside the first article-like container, else the whole body.
  const container = $('article, main, .content, .post, .entry, [role="main"]').first();
  const source = container.length ? container : $('body');

  // The selector also matches nested elements (e.g. <li><p>…</p></li>), which
  // yields the same text more than once. Keep only the first occurrence of each
  // fragment so duplicates do not eat into the MAX_TEXT_LEN budget.
  const seen = new Set();
  const paragraphs = [];
  source.find('p, h2, h3, li').each((_, el) => {
    const fragment = $(el).text().trim();
    // Fragments of 40 characters or fewer are skipped, as are repeats.
    if (fragment.length <= 40 || seen.has(fragment)) return;
    seen.add(fragment);
    paragraphs.push(fragment);
  });

  const bodyText = paragraphs.join('\n').slice(0, MAX_TEXT_LEN);

  const text = [description, bodyText].filter(Boolean).join('\n\n').slice(0, MAX_TEXT_LEN);

  return { title, text, imageUrl, source: 'web' };
}

// ── Роутер источников ─────────────────────────────────────────────────────────

/**
 * Picks a parser by the URL's host and returns the parsed page.
 *
 * Hosts are matched exactly or as a subdomain (`www.youtube.com`,
 * `m.youtube.com`), never as a substring, so look-alike hosts such as
 * `notyoutube.com` or `youtube.com.evil.example` go to the generic web parser.
 *
 * @param {string} url - Absolute URL to parse.
 * @returns {Promise<{title: string, text: string, imageUrl: (string|null), source: string}>}
 * @throws {Error} Any failure (invalid URL, network or parsing error) is
 *   rethrown with the 'Не удалось загрузить страницу: …' message; the original
 *   error is kept in `cause` on runtimes that support it.
 */
async function parseUrl(url) {
  try {
    const { hostname } = new URL(url);
    const isHostOrSubdomainOf = (domain) => hostname === domain || hostname.endsWith(`.${domain}`);

    if (isHostOrSubdomainOf('youtube.com') || isHostOrSubdomainOf('youtu.be')) {
      return await parseYoutube(url);
    }
    if (hostname === 't.me') {
      return await parseTelegram(url);
    }
    return await parseWeb(url);
  } catch (err) {
    throw new Error(`Не удалось загрузить страницу: ${err.message}`, { cause: err });
  }
}

// ── Генерация поста по распарсенному контенту ─────────────────────────────────

/**
 * Downloads the page behind `url`, turns its content into a prompt and asks the
 * AI service for a Telegram post written in the channel's voice.
 *
 * @param {object} params
 * @param {string} params.url - Page to base the post on.
 * @param {*} [params.channelId] - Accepted for interface compatibility; not read here.
 * @param {object} params.channel - Channel passed to the prompt builder.
 * @returns {Promise<{content: *, title: string, imageUrl: (string|null), source: string, usage: *}>}
 * @throws {Error} When `url` or `channel` is missing, or when neither a title
 *   nor any text could be extracted from the page.
 */
async function generateFromUrl({ url, channelId, channel }) {
  if (!url) {
    throw new Error('url required');
  }
  if (!channel) {
    throw new Error('channel required');
  }

  // Step 1: fetch and parse the page.
  const parsed = await parseUrl(url);
  const { title, text, imageUrl, source } = parsed;

  if (!text && !title) {
    throw new Error('Не удалось извлечь текст со страницы');
  }

  // Step 2: assemble the system and user prompts.
  const channelContext = pb.buildPostSystemPrompt(channel, '');

  let sourceLabel = 'Веб-статья';
  if (source === 'youtube') {
    sourceLabel = 'YouTube-видео';
  } else if (source === 'telegram') {
    sourceLabel = 'Telegram-пост';
  }

  const heading = title || '—';
  const body = text || '(текст не извлечён, опирайся на заголовок)';

  const userPrompt = `На основе материала ниже напиши пост для Telegram-канала в стиле этого канала.

ИСТОЧНИК: ${sourceLabel}
URL: ${url}

ЗАГОЛОВОК:
${heading}

СОДЕРЖАНИЕ:
${body}

---

ЗАДАЧА:
— Напиши пост в стиле и голосе канала
— Передай суть материала своими словами, не пересказывай дословно
— Добавь свой угол зрения или вывод
— Длина поста: 150–500 символов
— Верни ТОЛЬКО текст поста, без пояснений`;

  // Step 3: generate. The config module is loaded lazily, at call time.
  const model = require('../config').ai.models.post;
  const generationOptions = { maxTokens: 1000, temperature: 0.85 };
  const result = await ai.chat(model, channelContext, userPrompt, generationOptions);

  return {
    content: result.text,
    title,
    imageUrl,
    source,
    usage: result.usage,
  };
}

// Exported API: the post generator and the URL parser it is built on.
module.exports = { generateFromUrl, parseUrl };