2026-03-11 05:14:48 +08:00

347 lines
9.9 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import { chromium } from 'playwright'
import readlineSync from 'readline-sync'
import {
ensureDataFiles,
readYearData,
writeYearData,
writeStatus
} from './lib/cache.js'
// Address of the TV column page that main() navigates the browser to before capturing.
const PAGE_URL = 'https://www1.gdtv.cn/tvColumn/768'
/**
 * Picks a playable URL out of an episode's `videoUrl` field.
 *
 * The field is accepted as an object, as a JSON-encoded object, or as text
 * that is not JSON at all (an empty string or a bare URL).
 *
 * @param {unknown} videoField - Value of `videoUrl` as found in the episode data.
 * @returns {string} The `hd` URL, else the `sd` URL, else the plain text, else ''.
 */
function resolveEpisodeVideoUrl(videoField) {
  let parsed = videoField
  if (typeof videoField === 'string') {
    try {
      parsed = JSON.parse(videoField)
    } catch {
      // Not JSON: keep the text itself so an empty or bare-URL field
      // does not cause the whole episode to be discarded.
      return videoField
    }
  }
  if (typeof parsed === 'string') return parsed
  return parsed?.hd || parsed?.sd || ''
}

/**
 * Normalizes one raw list entry into the episode record that gets stored.
 *
 * `rawItem.data` may be an object or a JSON string. Missing fields fall back
 * to '' (text fields) or 0 (numeric fields); the id falls back to `rawItem.id`.
 *
 * @param {object} rawItem - One entry of the API response list.
 * @returns {object|null} The normalized episode, or null when the entry
 *   cannot be read (the error is logged, never thrown).
 */
function parseEpisodeItem(rawItem) {
  try {
    const dataObj = typeof rawItem.data === 'string' ? JSON.parse(rawItem.data) : rawItem.data
    return {
      id: dataObj.id || rawItem.id,
      title: dataObj.title || '',
      coverUrl: dataObj.coverUrl || '',
      releasedAt: dataObj.releasedAt || 0,
      timeLength: dataObj.timeLength || 0,
      videoUrl: resolveEpisodeVideoUrl(dataObj.videoUrl),
      raw: dataObj
    }
  } catch (error) {
    console.error('解析单条节目失败:', error)
    return null
  }
}
/**
 * Returns the calendar year (local time) that a release timestamp falls in.
 *
 * Falsy input and values that cannot be turned into a valid date both yield
 * the current year, so the result is always a usable number and never NaN.
 *
 * @param {number|string|Date} timestamp - Epoch milliseconds, a date string,
 *   or epoch milliseconds written as a string.
 * @returns {number} Four-digit year.
 */
function getYearByTimestamp(timestamp) {
  const currentYear = new Date().getFullYear()
  if (!timestamp) return currentYear

  let date = new Date(timestamp)
  if (Number.isNaN(date.getTime()) && typeof timestamp === 'string') {
    // `new Date('1700000000000')` is invalid; retry the text as epoch milliseconds.
    date = new Date(Number(timestamp))
  }

  const year = date.getFullYear()
  return Number.isNaN(year) ? currentYear : year
}
/**
 * Merges captured episodes into the per-year data files.
 *
 * Episodes are bucketed by the year of their `releasedAt`. For every bucket
 * the stored file is read, entries are merged by `id` (an incoming episode
 * replaces a stored one with the same id), the result is ordered by
 * `releasedAt` descending, and the file is written back with a fresh
 * `updatedAt`.
 *
 * @param {Iterable<object>} items - Normalized episodes to persist.
 * @returns {number} Sum over all years of (entries after merge - entries before).
 */
function saveItemsByYear(items) {
  // Step 1: bucket the incoming episodes by release year.
  const buckets = new Map()
  for (const episode of items) {
    const bucketYear = getYearByTimestamp(episode.releasedAt)
    const bucket = buckets.get(bucketYear)
    if (bucket) {
      bucket.push(episode)
    } else {
      buckets.set(bucketYear, [episode])
    }
  }

  // Step 2: merge every bucket into its stored year file.
  let addedTotal = 0
  buckets.forEach((incoming, year) => {
    const stored = readYearData(year)
    const previousItems = stored.items || []

    const byId = new Map()
    previousItems.forEach((entry) => {
      byId.set(entry.id, entry)
    })
    const sizeBefore = byId.size

    incoming.forEach((entry) => {
      byId.set(entry.id, entry)
    })

    const merged = [...byId.values()]
    merged.sort((left, right) => (right.releasedAt || 0) - (left.releasedAt || 0))
    addedTotal += merged.length - sizeBefore

    writeYearData(year, {
      year,
      updatedAt: new Date().toISOString(),
      items: merged
    })
  })

  return addedTotal
}
/**
 * Writes a message to stdout, tagged with the `[capture]` prefix.
 *
 * @param {...unknown} args - Values forwarded to `console.log` after the prefix.
 */
function log(...args) {
  const prefixed = ['[capture]', ...args]
  console.log(...prefixed)
}
/**
 * Pauses the calling async flow.
 *
 * @param {number} ms - Delay handed to `setTimeout`, in milliseconds.
 * @returns {Promise<void>} Settles once the timer fires.
 */
async function sleep(ms) {
  await new Promise((wake) => {
    setTimeout(wake, ms)
  })
}
/**
 * Reads the page's current scroll metrics by evaluating a snippet in the page.
 *
 * @param {object} page - Object exposing `evaluate(fn)` (a Playwright page).
 * @returns {Promise<{scrollTop: number, scrollHeight: number, innerHeight: number}>}
 *   Current vertical offset, the larger of the body/root scroll heights, and
 *   the viewport height.
 */
async function getPageScrollInfo(page) {
  // Runs inside the page, so it may only touch browser globals.
  const measure = () => {
    const root = document.documentElement
    const body = document.body
    const scrollTop = window.scrollY || root.scrollTop || body.scrollTop || 0
    const scrollHeight = Math.max(body.scrollHeight, root.scrollHeight)
    return { scrollTop, scrollHeight, innerHeight: window.innerHeight }
  }

  return page.evaluate(measure)
}
/**
 * Scrolls the page vertically by a fixed amount.
 *
 * @param {object} page - Object exposing `evaluate(fn, arg)` (a Playwright page).
 * @param {number} [distance=700] - Pixels passed to `window.scrollBy` as the vertical delta.
 * @returns {Promise<void>}
 */
async function scrollStep(page, distance = 700) {
  // Executed in the page; the distance arrives as the serialized argument.
  const scrollDown = (deltaY) => {
    window.scrollBy(0, deltaY)
  }

  await page.evaluate(scrollDown, distance)
}
/**
 * Looks for a visible "load more" button on the page.
 *
 * Candidate selectors are tried in order; the first one that matches at
 * least one element whose first match is visible wins. Failures while
 * counting or checking visibility count as "no match" / "not visible".
 *
 * @param {object} page - Object exposing `locator(selector)` (a Playwright page).
 * @returns {Promise<object|null>} Locator for the first match of the winning
 *   selector, or null when none of the candidates is visible.
 */
async function findLoadMoreButton(page) {
  const candidateSelectors = [
    '.index__load-component___1Ht2U button',
    'button:has-text("点击加载更多")',
    'button:has-text("加载更多")',
    '.ant-btn:has-text("点击加载更多")'
  ]

  for (const candidate of candidateSelectors) {
    const matches = page.locator(candidate)
    const matchCount = await matches.count().catch(() => 0)
    if (matchCount) {
      const firstMatch = matches.first()
      const isShown = await firstMatch.isVisible().catch(() => false)
      if (isShown) return firstMatch
    }
  }

  return null
}
/**
 * Counts list entries currently present in the page's DOM.
 *
 * A fixed list of selectors is tried in order inside the page and the element
 * count of the first selector that matches anything is returned.
 *
 * @param {object} page - Object exposing `evaluate(fn)` (a Playwright page).
 * @returns {Promise<number>} Element count, or 0 when nothing matches or the
 *   evaluation is rejected.
 */
async function getDomItemCount(page) {
  // Runs inside the page; `found || ...` stops querying once a selector has matched.
  const countFirstMatchingSelector = () => {
    const candidates = [
      '.index__item-component___1q1ob',
      '.index__list-item',
      '.ant-list-item',
      'li'
    ]
    return candidates.reduce(
      (found, css) => found || document.querySelectorAll(css).length,
      0
    )
  }

  const pendingCount = page.evaluate(countFirstMatchingSelector)
  return pendingCount.catch(() => 0)
}
/**
 * Clicks the "load more" button and waits for the list API to answer.
 *
 * The response wait is armed before the click so the request triggered by
 * the click cannot be missed. If the normal click is rejected, a forced
 * click is attempted; a failure of that second attempt propagates.
 *
 * @param {object} page - Playwright page.
 * @returns {Promise<boolean>} false when no button is visible or no matching
 *   response arrived within 15 s; true when a matching response was seen.
 */
async function clickLoadMoreAndWait(page) {
  const loadMore = await findLoadMoreButton(page)
  if (!loadMore) return false

  // A successful GET against the column news endpoint.
  const isNewsListResponse = (res) => {
    if (!res.url().includes('gdtv-api.gdtv.cn/api/tvColumn/v1/news')) return false
    if (res.request().method() !== 'GET') return false
    return res.status() === 200
  }
  const pendingResponse = page
    .waitForResponse(isNewsListResponse, { timeout: 15000 })
    .catch(() => null)

  await loadMore.scrollIntoViewIfNeeded().catch(() => {})
  await sleep(800)

  try {
    await loadMore.click({ timeout: 10000 })
  } catch {
    await loadMore.click({ force: true, timeout: 10000 })
  }

  const captured = await pendingResponse
  // Fixed pause after the response, before reporting back to the caller.
  await sleep(1800)
  return Boolean(captured)
}
/**
 * Synchronously asks the operator on the terminal whether to keep collecting.
 *
 * @returns {boolean} true only when the trimmed, lower-cased answer is exactly "y".
 */
function askContinue() {
  const reply = readlineSync.question('\n页面已滑到底部继续采集吗(y/n): ')
  const normalized = reply.trim().toLowerCase()
  return normalized === 'y'
}
/**
 * Drives the page until the list stops growing: clicks "load more" whenever
 * the button is visible, otherwise scrolls down step by step.
 *
 * A round counts as "no change" when the DOM item count did not grow (and,
 * while scrolling, the scroll offset did not move either). After six such
 * rounds in a row the operator is asked whether to go on. That check runs
 * after button clicks as well as after scroll steps, so a button that stays
 * visible but no longer yields items cannot be clicked until `maxRounds`.
 * The operator is also asked when the bottom of the page is reached and no
 * button shows up.
 *
 * @param {object} page - Playwright page that is already on the list.
 * @param {number} [maxRounds=150] - Upper bound on loop iterations.
 * @returns {Promise<void>}
 */
async function autoCollectByScrollAndClick(page, maxRounds = 150) {
  const STALL_ROUND_LIMIT = 6

  let round = 0
  let noChangeRounds = 0
  let lastDomCount = await getDomItemCount(page)
  let lastScrollTop = 0

  // Asks the operator once the stall limit is hit; returns true when the loop must stop.
  const shouldStopAfterStall = () => {
    if (noChangeRounds < STALL_ROUND_LIMIT) return false
    if (!askContinue()) {
      log('连续多轮没有新内容,且你选择结束采集')
      return true
    }
    log('你选择继续采集,重置无变化计数')
    noChangeRounds = 0
    return false
  }

  while (round < maxRounds) {
    round += 1

    const button = await findLoadMoreButton(page)
    if (button) {
      log(`${round} 轮:发现“加载更多”按钮,准备点击`)
      const clicked = await clickLoadMoreAndWait(page)
      const currentDomCount = await getDomItemCount(page)
      if (clicked) {
        log(`${round} 轮:点击成功,当前列表数量 ${currentDomCount}`)
      } else {
        log(`${round} 轮:点击后未捕获到新响应`)
      }
      if (currentDomCount <= lastDomCount) {
        noChangeRounds += 1
      } else {
        noChangeRounds = 0
        lastDomCount = currentDomCount
      }
      await sleep(1200)
      // Without this check the counter above was never consulted on the click path.
      if (shouldStopAfterStall()) break
      continue
    }

    await scrollStep(page, 700)
    await sleep(1400)
    const { scrollTop, scrollHeight, innerHeight } = await getPageScrollInfo(page)
    const currentDomCount = await getDomItemCount(page)
    log(
      `${round} 轮:继续下滑 scrollTop=${scrollTop} scrollHeight=${scrollHeight} items=${currentDomCount}`
    )
    if (currentDomCount <= lastDomCount && scrollTop === lastScrollTop) {
      noChangeRounds += 1
    } else {
      if (currentDomCount > lastDomCount) {
        lastDomCount = currentDomCount
      }
      noChangeRounds = 0
    }
    lastScrollTop = scrollTop

    // Within 80px of the end of the document counts as "at the bottom".
    const nearBottom = scrollTop + innerHeight >= scrollHeight - 80
    if (nearBottom) {
      log(`${round} 轮:已经接近页面底部`)
      // Pause before re-checking for the button.
      await sleep(1800)
      const retryButton = await findLoadMoreButton(page)
      if (retryButton) {
        log('到底部后重新检测到“加载更多”按钮,继续点击')
        continue
      }
      const shouldContinue = askContinue()
      if (!shouldContinue) {
        log('你选择结束采集')
        break
      }
      log('你选择继续采集,尝试再次下滑检测')
      noChangeRounds = 0
      await sleep(1000)
    }

    if (shouldStopAfterStall()) break
  }

  log(`自动采集流程结束,共执行 ${round}`)
}
/**
 * Runs one capture session.
 *
 * Prepares the data files, opens the column page in a persistent, headed
 * Chromium context, stores every list-API response as it arrives, lets
 * autoCollectByScrollAndClick drive the page, and records progress through
 * writeStatus. The browser context is closed when the session ends, whether
 * it succeeded or failed.
 *
 * @returns {Promise<void>} Rejects with whatever error interrupted the session.
 */
async function main() {
  ensureDataFiles()
  writeStatus({
    running: true,
    lastMessage: '正在启动浏览器采集',
    updatedAt: new Date().toISOString()
  })

  const browser = await chromium.launchPersistentContext('./.pw-user-data', {
    headless: false
  })

  try {
    const page = await browser.newPage()

    // Persist every successful list response, whichever action triggered it.
    page.on('response', async (response) => {
      const url = response.url()
      if (!url.includes('gdtv-api.gdtv.cn/api/tvColumn/v1/news')) return
      if (response.status() !== 200) return
      try {
        const data = await response.json()
        const parsedItems = (data.list || [])
          .map(parseEpisodeItem)
          .filter(Boolean)
        const addedCount = saveItemsByYear(parsedItems)
        const totalItems = parsedItems.length
        const currentPage = new URL(url).searchParams.get('currentPage')
        writeStatus({
          running: true,
          lastMessage: `已采集第 ${currentPage}`,
          updatedAt: new Date().toISOString(),
          currentPage,
          pageItems: totalItems,
          addedCount
        })
        log(`采集成功: page=${currentPage} 本页=${totalItems} 新增唯一=${addedCount}`)
      } catch (error) {
        console.error('[capture] 解析响应失败:', error)
      }
    })

    // Arm the wait before navigating: a wait registered after the page has
    // loaded only sees later responses and would just run into its timeout.
    const firstPageResponse = page.waitForResponse(
      (response) =>
        response.url().includes('gdtv-api.gdtv.cn/api/tvColumn/v1/news') &&
        response.request().method() === 'GET' &&
        response.status() === 200,
      { timeout: 15000 }
    ).catch(() => null)

    await page.goto(PAGE_URL, { waitUntil: 'domcontentloaded' })
    await page.waitForTimeout(5000)
    log('已打开页面:', PAGE_URL)
    log('等待第一页接口加载...')
    // Best effort: a missing first response does not abort the session.
    await firstPageResponse
    await sleep(2500)

    writeStatus({
      running: true,
      lastMessage: '开始自动下滑并检测加载更多按钮',
      updatedAt: new Date().toISOString()
    })
    await autoCollectByScrollAndClick(page, 150)
    writeStatus({
      running: false,
      lastMessage: '自动采集完成',
      updatedAt: new Date().toISOString()
    })
    log('自动采集完成,数据已按年份写入 data/*.json')
  } finally {
    // Release the browser so the process can exit; a close failure must not
    // mask the error (if any) that ended the session.
    await browser.close().catch((error) => {
      console.error('[capture] 关闭浏览器失败:', error)
    })
  }
}
// Script entry point: run the capture and, on failure, log the error, record
// it in the status file and make the process report a non-zero exit code.
main().catch((error) => {
  console.error('采集启动失败:', error)
  // Rejections are not guaranteed to be Error instances.
  const reason = error?.message ?? String(error)
  writeStatus({
    running: false,
    lastMessage: `采集启动失败: ${reason}`,
    updatedAt: new Date().toISOString()
  })
  process.exitCode = 1
})