修改先下滑一段
• sleep 1.4s • 再检测按钮 • 没按钮就继续下滑 • 有按钮就点 • 点完等接口响应 • 再继续循环
This commit is contained in:
parent
2d5aa1d860
commit
9d97bbc051
1
.gitignore
vendored
1
.gitignore
vendored
@ -1,4 +1,5 @@
|
||||
.DS_Store
|
||||
.pw-user-data
|
||||
node_modules/
|
||||
dist/
|
||||
npm-debug.log*
|
||||
|
||||
316
backServer/capture.js
Normal file
316
backServer/capture.js
Normal file
@ -0,0 +1,316 @@
|
||||
import { chromium } from 'playwright'
|
||||
import { ensureDataFiles, readCache, writeCache, writeStatus } from './lib/cache.js'
|
||||
|
||||
// Page that main() opens in the browser; the episode-list API responses it triggers are what gets captured.
const PAGE_URL = 'https://www1.gdtv.cn/tvColumn/768'
|
||||
|
||||
/**
 * Normalize one raw list entry from the column API into a flat episode record.
 *
 * `rawItem.data` and the nested `videoUrl` may each arrive either as a JSON
 * string or as an already-parsed object; both forms are accepted.
 *
 * @param {object} rawItem - Raw entry carrying `data` and, optionally, `id`.
 * @returns {object|null} The normalized episode, or `null` (after logging the
 *   error) when `rawItem.data` is missing or is not valid JSON.
 */
function parseEpisodeItem(rawItem) {
  try {
    const dataObj = typeof rawItem.data === 'string' ? JSON.parse(rawItem.data) : rawItem.data

    // A blank or malformed videoUrl string must not discard the whole episode:
    // treat it as "no video variants" so the '' fallback below applies.
    let videoObj = dataObj.videoUrl
    if (typeof videoObj === 'string') {
      try {
        videoObj = JSON.parse(videoObj)
      } catch {
        videoObj = null
      }
    }

    return {
      id: dataObj.id || rawItem.id,
      title: dataObj.title || '',
      coverUrl: dataObj.coverUrl || '',
      releasedAt: dataObj.releasedAt || 0,
      timeLength: dataObj.timeLength || 0,
      // Prefer the HD stream and fall back to SD.
      videoUrl: videoObj?.hd || videoObj?.sd || '',
      raw: dataObj
    }
  } catch (error) {
    console.error('解析单条节目失败:', error)
    return null
  }
}
|
||||
|
||||
/**
 * Fold one page of API results into the cached snapshot and return the new
 * snapshot. Neither argument is mutated.
 *
 * Episodes are de-duplicated by `id` (entries from `data` replace cached ones)
 * and the merged list is sorted by `releasedAt`, newest first. Paging
 * information is read from the `currentPage` / `beginScore` query parameters
 * of `responseUrl`.
 *
 * @param {object|null|undefined} oldCache - Previously stored cache object.
 * @param {string} responseUrl - Absolute URL of the API response being merged.
 * @param {object|null|undefined} data - Parsed response body; `data.list` holds the raw entries.
 * @returns {object} The merged cache object.
 * @throws {TypeError} If `responseUrl` is not a valid absolute URL.
 */
function mergeCache(oldCache, responseUrl, data) {
  // Tolerate an empty cache file ("null") or an empty response body.
  const cache = oldCache ?? {}
  const payload = data ?? {}

  const { searchParams } = new URL(responseUrl)
  const currentPage = Number(searchParams.get('currentPage') || 1)
  const beginScore = Number(searchParams.get('beginScore') || 0)

  const rawList = Array.isArray(payload.list) ? payload.list : []
  const parsedItems = rawList.map(parseEpisodeItem).filter(Boolean)

  const itemMap = new Map((cache.items || []).map((item) => [item.id, item]))
  for (const item of parsedItems) {
    itemMap.set(item.id, item)
  }

  // One timestamp for the whole merge so updatedAt and capturedAt always agree.
  const now = new Date().toISOString()

  return {
    ...cache,
    updatedAt: now,
    name: payload.name || cache.name || '七十二家房客',
    coverUrl: payload.coverUrl || cache.coverUrl || '',
    displayType: payload.displayType ?? cache.displayType ?? 0,
    beginScoreMap: {
      ...(cache.beginScoreMap || {}),
      [currentPage]: beginScore
    },
    pages: {
      ...(cache.pages || {}),
      [currentPage]: {
        currentPage,
        beginScore,
        count: parsedItems.length,
        capturedAt: now
      }
    },
    items: Array.from(itemMap.values()).sort((a, b) => (b.releasedAt || 0) - (a.releasedAt || 0))
  }
}
|
||||
|
||||
/**
 * Print a message to stdout tagged with the `[capture]` prefix.
 *
 * @param {...*} args - Values forwarded to `console.log` after the prefix.
 */
function log(...args) {
  console.log('[capture]', ...args)
}
|
||||
|
||||
/**
 * Resolve after roughly `ms` milliseconds.
 *
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>} Promise that resolves once the timer fires.
 */
function sleep(ms) {
  // Already returns a Promise, so no `async` wrapper is needed.
  return new Promise((resolve) => setTimeout(resolve, ms))
}
|
||||
|
||||
/**
 * Read the current scroll metrics from the page.
 *
 * @param {object} page - Playwright page (only `evaluate` is used).
 * @returns {Promise<{scrollTop: number, scrollHeight: number, innerHeight: number}>}
 *   Current vertical offset, the larger of the body/documentElement scroll
 *   heights, and the viewport height.
 */
async function getPageScrollInfo(page) {
  return page.evaluate(() => ({
    // The first non-zero source wins; which one is populated varies by page.
    scrollTop: window.scrollY || document.documentElement.scrollTop || document.body.scrollTop || 0,
    scrollHeight: Math.max(document.body.scrollHeight, document.documentElement.scrollHeight),
    innerHeight: window.innerHeight
  }))
}
|
||||
|
||||
/**
 * Scroll the page vertically by a fixed amount.
 *
 * @param {object} page - Playwright page (only `evaluate` is used).
 * @param {number} [distance=700] - Pixels to scroll; positive values scroll down.
 * @returns {Promise<void>}
 */
async function scrollStep(page, distance = 700) {
  await page.evaluate((step) => {
    window.scrollBy(0, step)
  }, distance)
}
|
||||
|
||||
/**
 * Locate the visible "load more" button, if the page currently shows one.
 *
 * Candidate selectors are tried in order; the first match of the first
 * selector whose first element is visible wins. Lookup errors are treated as
 * "not found" rather than propagated.
 *
 * @param {object} page - Playwright page (only `locator` is used).
 * @returns {Promise<object|null>} Locator for the button, or `null` if none is visible.
 */
async function findLoadMoreButton(page) {
  const selectors = [
    '.index__load-component___1Ht2U button',
    'button:has-text("点击加载更多")',
    'button:has-text("加载更多")',
    '.ant-btn:has-text("点击加载更多")'
  ]

  for (const selector of selectors) {
    // Build the locator once and reuse it for both the count and the visibility probe.
    const matches = page.locator(selector)
    const count = await matches.count().catch(() => 0)
    if (!count) continue

    const candidate = matches.first()
    const visible = await candidate.isVisible().catch(() => false)
    if (visible) {
      return candidate
    }
  }

  return null
}
|
||||
|
||||
/**
 * Count the list entries currently rendered in the page.
 *
 * Selectors are tried from most to least specific and the count for the first
 * selector that matches anything is returned.
 * NOTE(review): the last fallback is a bare `li`, which would also count any
 * unrelated list items on the page — confirm this is acceptable as a growth signal.
 *
 * @param {object} page - Playwright page (only `evaluate` is used).
 * @returns {Promise<number>} Number of matched elements; 0 when nothing matches
 *   or when evaluation fails.
 */
async function getDomItemCount(page) {
  return page
    .evaluate(() => {
      const selectors = [
        '.index__item-component___1q1ob',
        '.index__list-item',
        '.ant-list-item',
        'li'
      ]

      for (const selector of selectors) {
        const count = document.querySelectorAll(selector).length
        if (count > 0) return count
      }

      return 0
    })
    .catch(() => 0)
}
|
||||
|
||||
/**
 * Click the "load more" button and wait for the episode-list API to answer.
 *
 * @param {object} page - Playwright page.
 * @returns {Promise<boolean>} `true` when a matching 200 GET response was seen
 *   within 15 s of starting to wait; `false` when there is no visible button,
 *   the click could not be performed, or no matching response arrived.
 */
async function clickLoadMoreAndWait(page) {
  const button = await findLoadMoreButton(page)
  if (!button) return false

  // Start listening before the click so a fast response cannot be missed.
  // A timeout is mapped to null instead of rejecting.
  const responsePromise = page
    .waitForResponse(
      (response) =>
        response.url().includes('gdtv-api.gdtv.cn/api/tvColumn/v1/news') &&
        response.request().method() === 'GET' &&
        response.status() === 200,
      { timeout: 15000 }
    )
    .catch(() => null)

  await button.scrollIntoViewIfNeeded().catch(() => {})
  await sleep(800)

  try {
    await button.click({ timeout: 10000 })
  } catch {
    try {
      // Retry without actionability checks.
      await button.click({ force: true, timeout: 10000 })
    } catch (error) {
      // Report "nothing loaded" instead of aborting the whole run; callers
      // already treat a false result as a round without progress.
      console.error('[capture] 点击加载更多按钮失败:', error)
      return false
    }
  }

  const response = await responsePromise
  // Give the page time to render the newly loaded entries.
  await sleep(1800)

  return !!response
}
|
||||
|
||||
/**
 * Drive the page until no more episodes can be loaded.
 *
 * Each round either clicks the "load more" button when one is visible, or
 * scrolls down one step and re-checks. The loop ends when `maxRounds` is
 * reached, when six scroll rounds in a row bring no change, or when six button
 * clicks in a row produce neither an API response nor new DOM entries.
 *
 * @param {object} page - Playwright page.
 * @param {number} [maxRounds=120] - Upper bound on loop iterations.
 * @returns {Promise<void>}
 */
async function autoCollectByScrollAndClick(page, maxRounds = 120) {
  const MAX_IDLE_ROUNDS = 6

  let round = 0
  let noChangeRounds = 0
  // Consecutive clicks that achieved nothing. Tracked separately because the
  // click branch skips the idle check at the bottom of the loop; without this
  // a dead but still-visible button would be clicked until maxRounds.
  let stalledClicks = 0
  let lastDomCount = await getDomItemCount(page)
  let lastScrollTop = 0

  while (round < maxRounds) {
    round += 1

    const button = await findLoadMoreButton(page)

    if (button) {
      log(`第 ${round} 轮:发现“加载更多”按钮,准备点击`)
      const clicked = await clickLoadMoreAndWait(page)
      const currentDomCount = await getDomItemCount(page)

      if (clicked) {
        log(`第 ${round} 轮:点击成功,当前列表数量 ${currentDomCount}`)
      } else {
        log(`第 ${round} 轮:点击后未捕获到新响应`)
      }

      const domGrew = currentDomCount > lastDomCount
      if (domGrew) {
        noChangeRounds = 0
        lastDomCount = currentDomCount
      } else {
        noChangeRounds += 1
      }

      // A captured response counts as progress even if the DOM count did not move.
      stalledClicks = clicked || domGrew ? 0 : stalledClicks + 1
      if (stalledClicks >= MAX_IDLE_ROUNDS) {
        log('连续多轮没有新内容,停止采集')
        break
      }

      await sleep(1200)
      continue
    }

    stalledClicks = 0

    await scrollStep(page, 700)
    await sleep(1400)

    const { scrollTop, scrollHeight, innerHeight } = await getPageScrollInfo(page)
    const currentDomCount = await getDomItemCount(page)

    log(
      `第 ${round} 轮:继续下滑,scrollTop=${scrollTop},scrollHeight=${scrollHeight},items=${currentDomCount}`
    )

    if (currentDomCount <= lastDomCount && scrollTop === lastScrollTop) {
      noChangeRounds += 1
    } else {
      if (currentDomCount > lastDomCount) {
        lastDomCount = currentDomCount
      }
      noChangeRounds = 0
    }

    lastScrollTop = scrollTop

    // Within 80px of the end of the document.
    const nearBottom = scrollTop + innerHeight >= scrollHeight - 80

    if (nearBottom) {
      log(`第 ${round} 轮:已经接近页面底部`)
      await sleep(1800)

      // At the bottom with still no button: count it as an extra idle signal.
      const retryButton = await findLoadMoreButton(page)
      if (!retryButton) {
        noChangeRounds += 1
      }
    }

    if (noChangeRounds >= MAX_IDLE_ROUNDS) {
      log('连续多轮没有新内容,停止采集')
      break
    }
  }

  log(`自动采集流程结束,共执行 ${round} 轮`)
}
|
||||
|
||||
/**
 * Run one full capture session: open the column page in a persistent browser
 * profile, store every episode-list API response in the cache as it arrives,
 * page through the list automatically, and record progress in the status file.
 *
 * The browser context is closed when the session ends, whether it succeeded
 * or failed.
 *
 * @returns {Promise<void>} Rejects if launching, navigating or collecting fails.
 */
async function main() {
  ensureDataFiles()

  writeStatus({
    running: true,
    lastMessage: '正在启动浏览器采集',
    updatedAt: new Date().toISOString()
  })

  const browser = await chromium.launchPersistentContext('./.pw-user-data', {
    headless: false
  })

  try {
    const page = await browser.newPage()

    // Persist every successful episode-list response as it arrives.
    page.on('response', async (response) => {
      const url = response.url()

      if (!url.includes('gdtv-api.gdtv.cn/api/tvColumn/v1/news')) return
      if (response.status() !== 200) return

      try {
        const data = await response.json()
        const oldCache = readCache()
        const nextCache = mergeCache(oldCache, url, data)
        writeCache(nextCache)

        const currentPage = new URL(url).searchParams.get('currentPage')
        const capturedPages = Object.keys(nextCache.pages || {}).length
        const totalItems = nextCache.items?.length || 0

        writeStatus({
          running: true,
          lastMessage: `已采集第 ${currentPage} 页`,
          updatedAt: new Date().toISOString(),
          capturedPages,
          totalItems
        })

        log(`采集成功: page=${currentPage} totalItems=${totalItems}`)
      } catch (error) {
        console.error('[capture] 解析响应失败:', error)
      }
    })

    // Register the waiter before navigating: the first-page request fires
    // during page load, so a waiter created afterwards would miss it and only
    // return once its timeout expires.
    const firstPageResponse = page
      .waitForResponse(
        (response) =>
          response.url().includes('gdtv-api.gdtv.cn/api/tvColumn/v1/news') &&
          response.request().method() === 'GET' &&
          response.status() === 200,
        { timeout: 15000 }
      )
      .catch(() => null)

    await page.goto(PAGE_URL, { waitUntil: 'domcontentloaded' })
    await page.waitForTimeout(5000)

    log('已打开页面:', PAGE_URL)
    log('等待第一页接口加载...')

    await firstPageResponse

    await sleep(2500)

    writeStatus({
      running: true,
      lastMessage: '开始自动下滑并检测加载更多按钮',
      updatedAt: new Date().toISOString()
    })

    await autoCollectByScrollAndClick(page, 150)

    writeStatus({
      running: false,
      lastMessage: '自动采集完成',
      updatedAt: new Date().toISOString()
    })

    log('自动采集完成,数据已写入 data/qishier-cache.json')
  } finally {
    // Always release the browser. A failure to close must not mask the
    // original error, so it is only logged.
    await browser.close().catch((error) => {
      console.error('[capture] 关闭浏览器失败:', error)
    })
  }
}
|
||||
|
||||
// Script entry point: run the capture and, if it rejects, record the failure
// in the status file and mark the process as failed.
main().catch((error) => {
  console.error('采集启动失败:', error)

  // A rejection value is not guaranteed to be an Error (it may even be
  // null/undefined); reading `.message` unguarded would throw inside this handler.
  const reason = error?.message ?? String(error)

  writeStatus({
    running: false,
    lastMessage: `采集启动失败: ${reason}`,
    updatedAt: new Date().toISOString()
  })

  // Let callers (shell scripts, process managers) see the failure.
  process.exitCode = 1
})
|
||||
7
backServer/data/capture-status.json
Normal file
7
backServer/data/capture-status.json
Normal file
@ -0,0 +1,7 @@
|
||||
{
|
||||
"running": true,
|
||||
"lastMessage": "已采集第 2 页",
|
||||
"updatedAt": "2026-03-10T19:14:36.004Z",
|
||||
"capturedPages": 26,
|
||||
"totalItems": 1141
|
||||
}
|
||||
35565
backServer/data/qishier-cache.json
Normal file
35565
backServer/data/qishier-cache.json
Normal file
File diff suppressed because it is too large
Load Diff
76
backServer/lib/cache.js
Normal file
76
backServer/lib/cache.js
Normal file
@ -0,0 +1,76 @@
|
||||
import fs from 'node:fs'
|
||||
import path from 'node:path'
|
||||
import { fileURLToPath } from 'node:url'
|
||||
|
||||
// ES modules have no built-in __filename/__dirname, so derive them from the module URL.
const __filename = fileURLToPath(import.meta.url)
const __dirname = path.dirname(__filename)

// Data files live in the `data` directory next to this module's parent folder.
const dataDir = path.resolve(__dirname, '../data')
const cacheFile = path.join(dataDir, 'qishier-cache.json')
const statusFile = path.join(dataDir, 'capture-status.json')
|
||||
|
||||
/**
 * Create the data directory (and any missing parents) if it does not exist yet.
 */
function ensureDir() {
  // With `recursive: true` this is a no-op for an existing directory, so no
  // separate existence check (and no check-then-create race) is needed.
  fs.mkdirSync(dataDir, { recursive: true })
}
|
||||
|
||||
/**
 * Write `value` as pretty-printed JSON to `file`, but only if the file does
 * not exist yet.
 */
function writeJsonIfMissing(file, value) {
  try {
    // 'wx' creates the file exclusively: it fails with EEXIST instead of
    // truncating, which avoids the exists-then-write race.
    fs.writeFileSync(file, JSON.stringify(value, null, 2), { encoding: 'utf-8', flag: 'wx' })
  } catch (error) {
    if (error.code !== 'EEXIST') throw error
  }
}

/**
 * Make sure the data directory, the cache file and the status file exist,
 * seeding each missing file with its empty default. Existing files are never
 * modified.
 *
 * @throws {Error} If the directory or a file cannot be created.
 */
export function ensureDataFiles() {
  ensureDir()

  writeJsonIfMissing(cacheFile, {
    updatedAt: null,
    name: '七十二家房客',
    coverUrl: '',
    displayType: 0,
    pages: {},
    beginScoreMap: {},
    items: []
  })

  writeJsonIfMissing(statusFile, {
    running: false,
    lastMessage: '未启动采集',
    updatedAt: null
  })
}
|
||||
|
||||
/**
 * Load the episode cache from disk, creating the default file first if needed.
 *
 * @returns {*} The parsed JSON content of the cache file.
 * @throws {SyntaxError} If the file does not contain valid JSON.
 */
export function readCache() {
  ensureDataFiles()
  return JSON.parse(fs.readFileSync(cacheFile, 'utf-8'))
}
|
||||
|
||||
/**
 * Persist the episode cache as pretty-printed JSON.
 *
 * The content is written to a temporary sibling file and then renamed over the
 * real one, so an interrupted write cannot leave a truncated cache behind.
 *
 * @param {*} data - JSON-serializable cache object.
 * @throws {Error} If the file cannot be written or renamed.
 */
export function writeCache(data) {
  ensureDataFiles()

  const tempFile = `${cacheFile}.${process.pid}.tmp`
  fs.writeFileSync(tempFile, JSON.stringify(data, null, 2), 'utf-8')

  try {
    fs.renameSync(tempFile, cacheFile)
  } catch (error) {
    // Do not leave the temporary file lying around if the swap failed.
    fs.rmSync(tempFile, { force: true })
    throw error
  }
}
|
||||
|
||||
/**
 * Load the capture status from disk, creating the default file first if needed.
 *
 * @returns {*} The parsed JSON content of the status file.
 * @throws {SyntaxError} If the file does not contain valid JSON.
 */
export function readStatus() {
  ensureDataFiles()
  return JSON.parse(fs.readFileSync(statusFile, 'utf-8'))
}
|
||||
|
||||
/**
 * Persist the capture status as pretty-printed JSON.
 *
 * The content is written to a temporary sibling file and then renamed over the
 * real one, so a concurrent reader never observes a half-written status file.
 *
 * @param {*} status - JSON-serializable status object.
 * @throws {Error} If the file cannot be written or renamed.
 */
export function writeStatus(status) {
  ensureDataFiles()

  const tempFile = `${statusFile}.${process.pid}.tmp`
  fs.writeFileSync(tempFile, JSON.stringify(status, null, 2), 'utf-8')

  try {
    fs.renameSync(tempFile, statusFile)
  } catch (error) {
    // Do not leave the temporary file lying around if the swap failed.
    fs.rmSync(tempFile, { force: true })
    throw error
  }
}
|
||||
Loading…
x
Reference in New Issue
Block a user