import { chromium } from 'playwright'
import readlineSync from 'readline-sync'
import { ensureDataFiles, readYearData, writeYearData, writeStatus } from './lib/cache.js'

// Column page whose episode list is captured.
const PAGE_URL = 'https://www1.gdtv.cn/tvColumn/768'

// URL fragment identifying the paginated episode-list API call made by the page.
const NEWS_API_FRAGMENT = 'gdtv-api.gdtv.cn/api/tvColumn/v1/news'

// Number of consecutive rounds without progress before the user is asked whether to go on.
const STALL_ROUNDS_BEFORE_PROMPT = 6

/**
 * Tell whether a Playwright response is a successful GET of the episode-list API.
 *
 * Shared by every waiter/listener so they all agree on what counts as a list response.
 *
 * @param {import('playwright').Response} response
 * @returns {boolean}
 */
function isNewsListResponse(response) {
  return (
    response.url().includes(NEWS_API_FRAGMENT) &&
    response.request().method() === 'GET' &&
    response.status() === 200
  )
}

/**
 * Normalize one raw list entry from the API into a flat episode record.
 *
 * `rawItem.data` and the nested `videoUrl` may each arrive either as a JSON
 * string or as an already-parsed object; both forms are accepted.
 *
 * @param {object} rawItem - One element of the API response's `list` array.
 * @returns {object|null} The episode record, or `null` when the entry cannot be parsed.
 */
function parseEpisodeItem(rawItem) {
  try {
    const dataObj = typeof rawItem.data === 'string' ? JSON.parse(rawItem.data) : rawItem.data
    const videoObj = typeof dataObj.videoUrl === 'string' ? JSON.parse(dataObj.videoUrl) : dataObj.videoUrl

    return {
      id: dataObj.id || rawItem.id,
      title: dataObj.title || '',
      coverUrl: dataObj.coverUrl || '',
      releasedAt: dataObj.releasedAt || 0,
      timeLength: dataObj.timeLength || 0,
      // Prefer the HD stream, fall back to SD.
      videoUrl: videoObj?.hd || videoObj?.sd || '',
      raw: dataObj
    }
  } catch (error) {
    console.error('解析单条节目失败:', error)
    return null
  }
}

/**
 * Return the calendar year (local time) for a timestamp.
 *
 * Falls back to the current year when the timestamp is missing or cannot be
 * turned into a valid date, so callers never receive `NaN`.
 * NOTE(review): the value is passed straight to `new Date()`, i.e. it is
 * treated as epoch milliseconds — confirm against the API.
 *
 * @param {number|string} timestamp
 * @returns {number}
 */
function getYearByTimestamp(timestamp) {
  const fallbackYear = new Date().getFullYear()
  if (!timestamp) return fallbackYear

  const year = new Date(timestamp).getFullYear()
  return Number.isNaN(year) ? fallbackYear : year
}

/**
 * Merge episode records into the per-year data files.
 *
 * Items are grouped by release year, merged with the existing items of that
 * year keyed by `id` (new records replace old ones with the same id), sorted
 * by `releasedAt` descending and written back.
 *
 * @param {object[]} items - Records produced by `parseEpisodeItem`.
 * @returns {number} How many previously unseen ids were added across all years.
 */
function saveItemsByYear(items) {
  const yearGroups = new Map()

  for (const item of items) {
    const year = getYearByTimestamp(item.releasedAt)
    if (!yearGroups.has(year)) {
      yearGroups.set(year, [])
    }
    yearGroups.get(year).push(item)
  }

  let totalUniqueAdded = 0

  for (const [year, groupItems] of yearGroups.entries()) {
    const oldData = readYearData(year)
    // Tolerate a missing/empty year file instead of throwing on `.items`.
    const itemMap = new Map((oldData?.items || []).map((item) => [item.id, item]))
    const beforeCount = itemMap.size

    for (const item of groupItems) {
      itemMap.set(item.id, item)
    }

    const nextItems = Array.from(itemMap.values()).sort(
      (a, b) => (b.releasedAt || 0) - (a.releasedAt || 0)
    )
    totalUniqueAdded += nextItems.length - beforeCount

    writeYearData(year, {
      year,
      updatedAt: new Date().toISOString(),
      items: nextItems
    })
  }

  return totalUniqueAdded
}

/** Console logger that prefixes every message with `[capture]`. */
function log(...args) {
  console.log('[capture]', ...args)
}

/**
 * Resolve after `ms` milliseconds.
 *
 * @param {number} ms
 * @returns {Promise<void>}
 */
function sleep(ms) {
  return new Promise((resolve) => setTimeout(resolve, ms))
}

/**
 * Read the page's current scroll position and dimensions.
 *
 * @param {import('playwright').Page} page
 * @returns {Promise<{scrollTop: number, scrollHeight: number, innerHeight: number}>}
 */
async function getPageScrollInfo(page) {
  return page.evaluate(() => ({
    scrollTop: window.scrollY || document.documentElement.scrollTop || document.body.scrollTop || 0,
    scrollHeight: Math.max(document.body.scrollHeight, document.documentElement.scrollHeight),
    innerHeight: window.innerHeight
  }))
}

/**
 * Scroll the window down by `distance` pixels.
 *
 * @param {import('playwright').Page} page
 * @param {number} [distance=700]
 */
async function scrollStep(page, distance = 700) {
  await page.evaluate((step) => {
    window.scrollBy(0, step)
  }, distance)
}

/**
 * Locate a visible "load more" button.
 *
 * Candidate selectors are tried in order; the first one whose first match is
 * visible wins. Lookup errors are treated as "not found".
 *
 * @param {import('playwright').Page} page
 * @returns {Promise<import('playwright').Locator|null>}
 */
async function findLoadMoreButton(page) {
  const selectors = [
    '.index__load-component___1Ht2U button',
    'button:has-text("点击加载更多")',
    'button:has-text("加载更多")',
    '.ant-btn:has-text("点击加载更多")'
  ]

  for (const selector of selectors) {
    const matches = page.locator(selector)
    const count = await matches.count().catch(() => 0)
    if (!count) continue

    const candidate = matches.first()
    const visible = await candidate.isVisible().catch(() => false)
    if (visible) {
      return candidate
    }
  }

  return null
}

/**
 * Count list entries currently rendered in the DOM.
 *
 * Selectors are tried from most to least specific and the count of the first
 * one that matches anything is returned; `0` on no match or evaluation error.
 *
 * @param {import('playwright').Page} page
 * @returns {Promise<number>}
 */
async function getDomItemCount(page) {
  return page
    .evaluate(() => {
      const selectors = [
        '.index__item-component___1q1ob',
        '.index__list-item',
        '.ant-list-item',
        'li'
      ]

      for (const selector of selectors) {
        const count = document.querySelectorAll(selector).length
        if (count > 0) return count
      }

      return 0
    })
    .catch(() => 0)
}

/**
 * Click the "load more" button and wait for the resulting list response.
 *
 * @param {import('playwright').Page} page
 * @returns {Promise<boolean>} `true` when a list response was observed after
 *   the click; `false` when there is no button, the click failed, or no
 *   response arrived within the timeout.
 */
async function clickLoadMoreAndWait(page) {
  const button = await findLoadMoreButton(page)
  if (!button) return false

  // Start listening before clicking so a fast response cannot be missed.
  const responsePromise = page
    .waitForResponse(isNewsListResponse, { timeout: 15000 })
    .catch(() => null)

  await button.scrollIntoViewIfNeeded().catch(() => {})
  await sleep(800)

  try {
    await button.click({ timeout: 10000 })
  } catch {
    // The normal click can fail when the button is covered; retry forced.
    try {
      await button.click({ force: true, timeout: 10000 })
    } catch (error) {
      // Report the failure to the caller instead of aborting the whole run.
      console.error('[capture] 点击“加载更多”按钮失败:', error)
      return false
    }
  }

  const response = await responsePromise
  await sleep(1800)

  return !!response
}

/**
 * Ask on the terminal whether collection should continue (blocking prompt).
 *
 * @returns {boolean} `true` only when the answer is `y` (case-insensitive).
 */
function askContinue() {
  const answer = readlineSync.question('\n页面已滑到底部,继续采集吗?(y/n): ')
  return answer.trim().toLowerCase() === 'y'
}

/**
 * Prompt the user after several rounds without progress.
 *
 * @returns {boolean} `true` when the user wants to keep collecting.
 */
function confirmContinueAfterStall() {
  if (!askContinue()) {
    log('连续多轮没有新内容,且你选择结束采集')
    return false
  }

  log('你选择继续采集,重置无变化计数')
  return true
}

/**
 * Drive the page until the list is exhausted or the user stops.
 *
 * Each round either clicks a visible "load more" button or scrolls down one
 * step. The user is prompted when the bottom of the page is reached without a
 * button, or after `STALL_ROUNDS_BEFORE_PROMPT` consecutive rounds without
 * progress (in either the click or the scroll path).
 *
 * @param {import('playwright').Page} page
 * @param {number} [maxRounds=150] - Hard upper bound on rounds.
 */
async function autoCollectByScrollAndClick(page, maxRounds = 150) {
  let round = 0
  let noChangeRounds = 0
  let lastDomCount = await getDomItemCount(page)
  let lastScrollTop = 0

  while (round < maxRounds) {
    round += 1

    const button = await findLoadMoreButton(page)

    if (button) {
      log(`第 ${round} 轮:发现“加载更多”按钮,准备点击`)
      const clicked = await clickLoadMoreAndWait(page)
      const currentDomCount = await getDomItemCount(page)

      if (clicked) {
        log(`第 ${round} 轮:点击成功,当前列表数量 ${currentDomCount}`)
      } else {
        log(`第 ${round} 轮:点击后未捕获到新响应`)
      }

      if (currentDomCount <= lastDomCount) {
        noChangeRounds += 1
      } else {
        noChangeRounds = 0
        lastDomCount = currentDomCount
      }

      await sleep(1200)

      // A button that no longer yields new items must also trigger the stall
      // prompt; otherwise it would be clicked until maxRounds is exhausted.
      if (noChangeRounds >= STALL_ROUNDS_BEFORE_PROMPT) {
        if (!confirmContinueAfterStall()) break
        noChangeRounds = 0
      }

      continue
    }

    await scrollStep(page, 700)
    await sleep(1400)

    const { scrollTop, scrollHeight, innerHeight } = await getPageScrollInfo(page)
    const currentDomCount = await getDomItemCount(page)

    log(
      `第 ${round} 轮:继续下滑,scrollTop=${scrollTop},scrollHeight=${scrollHeight},items=${currentDomCount}`
    )

    // Progress means either more items or an actual change in scroll position.
    if (currentDomCount <= lastDomCount && scrollTop === lastScrollTop) {
      noChangeRounds += 1
    } else {
      if (currentDomCount > lastDomCount) {
        lastDomCount = currentDomCount
      }
      noChangeRounds = 0
    }

    lastScrollTop = scrollTop

    const nearBottom = scrollTop + innerHeight >= scrollHeight - 80

    if (nearBottom) {
      log(`第 ${round} 轮:已经接近页面底部`)
      await sleep(1800)

      // The button may render late; check once more before asking the user.
      const retryButton = await findLoadMoreButton(page)
      if (retryButton) {
        log('到底部后重新检测到“加载更多”按钮,继续点击')
        continue
      }

      if (!askContinue()) {
        log('你选择结束采集')
        break
      }

      log('你选择继续采集,尝试再次下滑检测')
      noChangeRounds = 0
      await sleep(1000)
    }

    if (noChangeRounds >= STALL_ROUNDS_BEFORE_PROMPT) {
      if (!confirmContinueAfterStall()) break
      noChangeRounds = 0
    }
  }

  log(`自动采集流程结束,共执行 ${round} 轮`)
}

/**
 * Entry point: open the column page in a persistent browser profile, record
 * every episode-list API response into the per-year data files, and drive the
 * page until collection ends. The browser is always closed on the way out.
 */
async function main() {
  ensureDataFiles()

  writeStatus({
    running: true,
    lastMessage: '正在启动浏览器采集',
    updatedAt: new Date().toISOString()
  })

  const browser = await chromium.launchPersistentContext('./.pw-user-data', {
    headless: false
  })

  try {
    const page = await browser.newPage()

    page.on('response', async (response) => {
      if (!isNewsListResponse(response)) return

      const url = response.url()

      try {
        const data = await response.json()
        const parsedItems = (data.list || []).map(parseEpisodeItem).filter(Boolean)

        const addedCount = saveItemsByYear(parsedItems)
        const totalItems = parsedItems.length
        const currentPage = new URL(url).searchParams.get('currentPage')

        writeStatus({
          running: true,
          lastMessage: `已采集第 ${currentPage} 页`,
          updatedAt: new Date().toISOString(),
          currentPage,
          pageItems: totalItems,
          addedCount
        })

        log(`采集成功: page=${currentPage} 本页=${totalItems} 新增唯一=${addedCount}`)
      } catch (error) {
        console.error('[capture] 解析响应失败:', error)
      }
    })

    // Register the waiter BEFORE navigating: the first-page request fires
    // during page load, so a waiter created afterwards would miss it and
    // simply run into its timeout. The window covers the 5s settle delay
    // below plus the original 15s allowance.
    const firstPageResponse = page
      .waitForResponse(isNewsListResponse, { timeout: 20000 })
      .catch(() => null)

    await page.goto(PAGE_URL, { waitUntil: 'domcontentloaded' })
    await page.waitForTimeout(5000)

    log('已打开页面:', PAGE_URL)
    log('等待第一页接口加载...')

    await firstPageResponse
    await sleep(2500)

    writeStatus({
      running: true,
      lastMessage: '开始自动下滑并检测加载更多按钮',
      updatedAt: new Date().toISOString()
    })

    await autoCollectByScrollAndClick(page, 150)

    writeStatus({
      running: false,
      lastMessage: '自动采集完成',
      updatedAt: new Date().toISOString()
    })

    log('自动采集完成,数据已按年份写入 data/*.json')
  } finally {
    // Always release the browser so the process can exit, even on failure.
    await browser.close().catch((error) => {
      console.error('[capture] 关闭浏览器失败:', error)
    })
  }
}

main().catch((error) => {
  console.error('采集启动失败:', error)
  writeStatus({
    running: false,
    lastMessage: `采集启动失败: ${error.message}`,
    updatedAt: new Date().toISOString()
  })
})