347 lines
9.9 KiB
JavaScript
347 lines
9.9 KiB
JavaScript
import { chromium } from 'playwright'
|
||
import readlineSync from 'readline-sync'
|
||
import {
|
||
ensureDataFiles,
|
||
readYearData,
|
||
writeYearData,
|
||
writeStatus
|
||
} from './lib/cache.js'
|
||
|
||
// Column page that main() navigates to with page.goto() before collecting.
const PAGE_URL = 'https://www1.gdtv.cn/tvColumn/768'
|
||
|
||
/**
 * Normalizes one raw list entry from the news API into a flat episode record.
 *
 * `rawItem.data` and the nested `videoUrl` field are each accepted either as an
 * already-parsed object or as a JSON-encoded string.
 *
 * @param {object} rawItem - One element of the API response list.
 * @returns {object|null} The normalized episode (id, title, coverUrl, releasedAt,
 *   timeLength, videoUrl, raw), or null when the entry itself cannot be parsed.
 */
function parseEpisodeItem(rawItem) {
  try {
    const dataObj = typeof rawItem.data === 'string' ? JSON.parse(rawItem.data) : rawItem.data

    // A missing/empty/non-JSON videoUrl must not discard the whole episode:
    // keep the metadata and fall back to an empty video URL instead.
    let videoObj = dataObj.videoUrl
    if (typeof videoObj === 'string') {
      try {
        videoObj = JSON.parse(videoObj)
      } catch {
        videoObj = null
      }
    }

    return {
      id: dataObj.id || rawItem.id,
      title: dataObj.title || '',
      coverUrl: dataObj.coverUrl || '',
      releasedAt: dataObj.releasedAt || 0,
      timeLength: dataObj.timeLength || 0,
      // Prefer the HD stream, then SD.
      videoUrl: videoObj?.hd || videoObj?.sd || '',
      raw: dataObj
    }
  } catch (error) {
    console.error('解析单条节目失败:', error)
    return null
  }
}
|
||
|
||
/**
 * Returns the local calendar year for a timestamp.
 *
 * @param {number|string|Date} timestamp - Any value accepted by the Date constructor.
 * @returns {number} The year of `timestamp`, or the current year when the value
 *   is falsy or does not produce a valid date.
 */
function getYearByTimestamp(timestamp) {
  const currentYear = new Date().getFullYear()
  if (!timestamp) return currentYear

  // An invalid Date yields NaN here; never use NaN as a year bucket.
  const year = new Date(timestamp).getFullYear()
  return Number.isNaN(year) ? currentYear : year
}
|
||
|
||
/**
 * Merges episodes into the per-year data files.
 *
 * Items are grouped by the year of `releasedAt`, merged with the items already
 * stored for that year (keyed by `id`, incoming items replace stored ones),
 * sorted by `releasedAt` descending and written back.
 *
 * @param {Array<object>} items - Parsed episodes, each with `id` and `releasedAt`.
 * @returns {number} How many ids were not previously stored, summed over all years.
 */
function saveItemsByYear(items) {
  const yearGroups = new Map()

  for (const item of items) {
    const year = getYearByTimestamp(item.releasedAt)
    const group = yearGroups.get(year)
    if (group) {
      group.push(item)
    } else {
      yearGroups.set(year, [item])
    }
  }

  let totalUniqueAdded = 0

  for (const [year, groupItems] of yearGroups) {
    const oldData = readYearData(year)
    // Tolerate a missing or malformed year file: start from an empty list.
    const oldItems = Array.isArray(oldData?.items) ? oldData.items : []
    const itemMap = new Map(oldItems.map((item) => [item.id, item]))
    const beforeCount = itemMap.size

    for (const item of groupItems) {
      itemMap.set(item.id, item)
    }

    const nextItems = [...itemMap.values()].sort(
      (a, b) => (b.releasedAt || 0) - (a.releasedAt || 0)
    )
    totalUniqueAdded += nextItems.length - beforeCount

    writeYearData(year, {
      year,
      updatedAt: new Date().toISOString(),
      items: nextItems
    })
  }

  return totalUniqueAdded
}
|
||
|
||
/**
 * Writes a message to stdout prefixed with the `[capture]` tag.
 *
 * @param {...*} args - Values forwarded to console.log after the tag.
 */
function log(...args) {
  console.log('[capture]', ...args)
}
|
||
|
||
/**
 * Resolves after the given delay.
 *
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>} Promise that resolves once the timer fires.
 */
function sleep(ms) {
  // Already returns a Promise, so no `async` wrapper is needed.
  return new Promise((resolve) => setTimeout(resolve, ms))
}
|
||
|
||
/**
 * Reads the current scroll metrics from inside the page.
 *
 * @param {object} page - Object exposing `evaluate(fn)` that runs `fn` in the page.
 * @returns {Promise<{scrollTop: number, scrollHeight: number, innerHeight: number}>}
 *   Vertical scroll offset, full document height and viewport height.
 */
async function getPageScrollInfo(page) {
  return page.evaluate(() => ({
    // Try each place the scroll offset may be exposed, first non-zero wins.
    scrollTop:
      window.scrollY || document.documentElement.scrollTop || document.body.scrollTop || 0,
    scrollHeight: Math.max(document.body.scrollHeight, document.documentElement.scrollHeight),
    innerHeight: window.innerHeight
  }))
}
|
||
|
||
/**
 * Scrolls the page down by a fixed number of pixels.
 *
 * @param {object} page - Object exposing `evaluate(fn, arg)` that runs `fn` in the page.
 * @param {number} [distance=700] - Vertical distance passed to window.scrollBy.
 * @returns {Promise<void>}
 */
async function scrollStep(page, distance = 700) {
  await page.evaluate((step) => {
    window.scrollBy(0, step)
  }, distance)
}
|
||
|
||
/**
 * Looks for a visible "load more" button on the page.
 *
 * Selectors are tried in order; for each one only the first match is checked
 * for visibility. Errors from the locator calls are treated as "not found".
 *
 * @param {object} page - Object exposing `locator(selector)`.
 * @returns {Promise<object|null>} Locator of the first visible match, or null.
 */
async function findLoadMoreButton(page) {
  const selectors = [
    '.index__load-component___1Ht2U button',
    'button:has-text("点击加载更多")',
    'button:has-text("加载更多")',
    '.ant-btn:has-text("点击加载更多")'
  ]

  for (const selector of selectors) {
    // Build the locator once and reuse it for both the count and the visibility check.
    const matches = page.locator(selector)
    const count = await matches.count().catch(() => 0)
    if (!count) continue

    const candidate = matches.first()
    const visible = await candidate.isVisible().catch(() => false)
    if (visible) {
      return candidate
    }
  }

  return null
}
|
||
|
||
/**
 * Counts list items currently present in the page DOM.
 *
 * Selectors are tried from most to least specific and the count of the first
 * selector with at least one match is returned. The last selector is a bare
 * `li`, so the result is only a rough progress indicator.
 *
 * @param {object} page - Object exposing `evaluate(fn)` that runs `fn` in the page.
 * @returns {Promise<number>} Number of matching elements, or 0 when nothing
 *   matches or the evaluation fails.
 */
async function getDomItemCount(page) {
  try {
    return await page.evaluate(() => {
      const selectors = [
        '.index__item-component___1q1ob',
        '.index__list-item',
        '.ant-list-item',
        'li'
      ]

      for (const selector of selectors) {
        const count = document.querySelectorAll(selector).length
        if (count > 0) return count
      }

      return 0
    })
  } catch {
    // Best effort: an evaluation failure is treated as "no items".
    return 0
  }
}
|
||
|
||
/**
 * Clicks the "load more" button and waits for the resulting news API response.
 *
 * @param {object} page - Playwright-style page (`locator`, `waitForResponse`).
 * @returns {Promise<boolean>} true when a successful (HTTP 200, GET) news API
 *   response was observed after the click; false when no button is visible,
 *   the click could not be performed, or no response arrived within 15 seconds.
 */
async function clickLoadMoreAndWait(page) {
  const button = await findLoadMoreButton(page)
  if (!button) return false

  // Start listening before clicking so a fast response is not missed.
  // The timeout rejection is mapped to null, so this promise never rejects.
  const responsePromise = page
    .waitForResponse(
      (response) =>
        response.url().includes('gdtv-api.gdtv.cn/api/tvColumn/v1/news') &&
        response.request().method() === 'GET' &&
        response.status() === 200,
      { timeout: 15000 }
    )
    .catch(() => null)

  await button.scrollIntoViewIfNeeded().catch(() => {})
  await sleep(800)

  try {
    await button.click({ timeout: 10000 })
  } catch {
    // Retry once with a forced click; if that also fails, report "not loaded"
    // instead of rejecting and aborting the whole collection run.
    try {
      await button.click({ force: true, timeout: 10000 })
    } catch (error) {
      console.error('[capture] 点击“加载更多”按钮失败:', error)
      return false
    }
  }

  const response = await responsePromise
  // Give the page time to render the newly loaded items.
  await sleep(1800)

  return !!response
}
|
||
|
||
/**
 * Synchronously asks on the terminal whether collection should continue.
 *
 * @returns {boolean} true when the trimmed, case-insensitive answer is `y` or
 *   `yes`; false for anything else, including an empty answer.
 */
function askContinue() {
  const answer = readlineSync.question('\n页面已滑到底部,继续采集吗?(y/n): ')
  const normalized = String(answer ?? '').trim().toLowerCase()
  return normalized === 'y' || normalized === 'yes'
}
|
||
|
||
/**
 * Drives the page until no more content loads or the user chooses to stop.
 *
 * Each round either clicks a visible "load more" button or scrolls down one
 * step. Progress is judged by the DOM item count (and, while scrolling, the
 * scroll offset). After 6 consecutive rounds without progress, or when the
 * bottom of the page is reached with no button visible, the user is asked on
 * the terminal whether to continue.
 *
 * @param {object} page - Playwright-style page passed through to the helpers.
 * @param {number} [maxRounds=150] - Hard upper bound on the number of rounds.
 * @returns {Promise<void>}
 */
async function autoCollectByScrollAndClick(page, maxRounds = 150) {
  const STALL_ROUND_LIMIT = 6

  let round = 0
  let noChangeRounds = 0
  let lastDomCount = await getDomItemCount(page)
  let lastScrollTop = 0

  // Once the stall limit is reached, ask the user; returns false to stop.
  // Called at the end of BOTH the click path and the scroll path so that a
  // button which stays visible but loads nothing cannot spin until maxRounds.
  const confirmAfterStall = () => {
    if (noChangeRounds < STALL_ROUND_LIMIT) return true

    if (!askContinue()) {
      log('连续多轮没有新内容,且你选择结束采集')
      return false
    }

    log('你选择继续采集,重置无变化计数')
    noChangeRounds = 0
    return true
  }

  while (round < maxRounds) {
    round += 1

    const button = await findLoadMoreButton(page)

    if (button) {
      log(`第 ${round} 轮:发现“加载更多”按钮,准备点击`)
      const clicked = await clickLoadMoreAndWait(page)
      const currentDomCount = await getDomItemCount(page)

      if (clicked) {
        log(`第 ${round} 轮:点击成功,当前列表数量 ${currentDomCount}`)
      } else {
        log(`第 ${round} 轮:点击后未捕获到新响应`)
      }

      if (currentDomCount <= lastDomCount) {
        noChangeRounds += 1
      } else {
        noChangeRounds = 0
        lastDomCount = currentDomCount
      }

      await sleep(1200)

      if (!confirmAfterStall()) break
      continue
    }

    await scrollStep(page, 700)
    await sleep(1400)

    const { scrollTop, scrollHeight, innerHeight } = await getPageScrollInfo(page)
    const currentDomCount = await getDomItemCount(page)

    log(
      `第 ${round} 轮:继续下滑,scrollTop=${scrollTop},scrollHeight=${scrollHeight},items=${currentDomCount}`
    )

    // A scroll round counts as "no change" only if neither the item count
    // grew nor the scroll position moved.
    if (currentDomCount <= lastDomCount && scrollTop === lastScrollTop) {
      noChangeRounds += 1
    } else {
      if (currentDomCount > lastDomCount) {
        lastDomCount = currentDomCount
      }
      noChangeRounds = 0
    }

    lastScrollTop = scrollTop

    // Within 80px of the document end counts as the bottom.
    const nearBottom = scrollTop + innerHeight >= scrollHeight - 80

    if (nearBottom) {
      log(`第 ${round} 轮:已经接近页面底部`)
      await sleep(1800)

      // The button may only render once the bottom is reached; re-check
      // before bothering the user.
      const retryButton = await findLoadMoreButton(page)
      if (retryButton) {
        log('到底部后重新检测到“加载更多”按钮,继续点击')
        continue
      }

      if (!askContinue()) {
        log('你选择结束采集')
        break
      }

      log('你选择继续采集,尝试再次下滑检测')
      noChangeRounds = 0
      await sleep(1000)
    }

    if (!confirmAfterStall()) break
  }

  log(`自动采集流程结束,共执行 ${round} 轮`)
}
|
||
|
||
/**
 * Runs one full capture session.
 *
 * Launches a visible Chromium persistent context (profile stored in
 * `./.pw-user-data`), opens PAGE_URL, stores every successful news API response
 * into the per-year data files, drives the page with
 * autoCollectByScrollAndClick, and records progress through writeStatus.
 * The browser context is always closed before returning or throwing.
 *
 * @returns {Promise<void>}
 * @throws Propagates any launch, navigation or collection error to the caller.
 */
async function main() {
  const NEWS_API_FRAGMENT = 'gdtv-api.gdtv.cn/api/tvColumn/v1/news'

  ensureDataFiles()

  writeStatus({
    running: true,
    lastMessage: '正在启动浏览器采集',
    updatedAt: new Date().toISOString()
  })

  const browser = await chromium.launchPersistentContext('./.pw-user-data', {
    headless: false
  })

  try {
    const page = await browser.newPage()

    // Persist every successful news API response as it arrives.
    page.on('response', async (response) => {
      const url = response.url()

      if (!url.includes(NEWS_API_FRAGMENT)) return
      if (response.status() !== 200) return

      try {
        const data = await response.json()
        const rawList = Array.isArray(data?.list) ? data.list : []
        const parsedItems = rawList.map((rawItem) => parseEpisodeItem(rawItem)).filter(Boolean)

        const addedCount = saveItemsByYear(parsedItems)

        const totalItems = parsedItems.length
        const currentPage = new URL(url).searchParams.get('currentPage')

        writeStatus({
          running: true,
          lastMessage: `已采集第 ${currentPage} 页`,
          updatedAt: new Date().toISOString(),
          currentPage,
          pageItems: totalItems,
          addedCount
        })

        log(`采集成功: page=${currentPage} 本页=${totalItems} 新增唯一=${addedCount}`)
      } catch (error) {
        console.error('[capture] 解析响应失败:', error)
      }
    })

    // Register the wait BEFORE navigating: the first-page request is normally
    // answered while the page is loading, so a wait registered afterwards
    // would miss it and only return after the full timeout.
    const firstPageResponse = page
      .waitForResponse(
        (response) =>
          response.url().includes(NEWS_API_FRAGMENT) &&
          response.request().method() === 'GET' &&
          response.status() === 200,
        { timeout: 15000 }
      )
      .catch(() => null)

    await page.goto(PAGE_URL, { waitUntil: 'domcontentloaded' })
    await page.waitForTimeout(5000)

    log('已打开页面:', PAGE_URL)
    log('等待第一页接口加载...')

    await firstPageResponse

    await sleep(2500)

    writeStatus({
      running: true,
      lastMessage: '开始自动下滑并检测加载更多按钮',
      updatedAt: new Date().toISOString()
    })

    await autoCollectByScrollAndClick(page, 150)

    writeStatus({
      running: false,
      lastMessage: '自动采集完成',
      updatedAt: new Date().toISOString()
    })

    log('自动采集完成,数据已按年份写入 data/*.json')
  } finally {
    // Always release the browser so the process can exit, on success or failure.
    await browser.close().catch((error) => {
      console.error('[capture] 关闭浏览器失败:', error)
    })
  }
}
|
||
|
||
// Script entry point: run the capture and, on any failure, log it, record a
// non-running status and make the process exit with a non-zero code.
main().catch((error) => {
  console.error('采集启动失败:', error)
  process.exitCode = 1

  // The handler itself must never throw, otherwise the failure would surface
  // as an unhandled rejection instead of a recorded status.
  try {
    writeStatus({
      running: false,
      lastMessage: `采集启动失败: ${error?.message ?? String(error)}`,
      updatedAt: new Date().toISOString()
    })
  } catch (statusError) {
    console.error('[capture] 写入状态失败:', statusError)
  }
})
|