pip install requests beautifulsoup4 fake_useragent pyppeteer
# Render the Douyin landing page in headless Chromium (pyppeteer) and extract
# the video-card elements from the rendered HTML with BeautifulSoup.
import asyncio

import requests  # not used by this script; kept in case other code relies on it
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from pyppeteer import launch

ua = UserAgent()
# A single random User-Agent is picked once and reused for the whole run.
headers = {'User-Agent': ua.random}
url = 'https://www.douyin.com/'


async def get_video_list(session):
    """Return the video-card elements of the page loaded in ``session``.

    Args:
        session: A pyppeteer ``Page`` that has already navigated to ``url``.

    Returns:
        The BeautifulSoup result set of ``<div class="video-item">`` tags;
        empty when the rendered page contains none.
    """
    # A pyppeteer Page has no requests-style ``get``/``.text``; the rendered
    # markup is obtained with ``content()``.
    html = await session.content()
    soup = BeautifulSoup(html, 'html.parser')
    # NOTE(review): the 'video-item' class is taken as-is from the original
    # selector -- confirm it matches the live page markup.
    return soup.find_all('div', class_='video-item')


async def main():
    """Open ``url`` in a headless browser and return its video cards."""
    browser = await launch()
    try:
        page = await browser.newPage()
        # The User-Agent must be set on the tab itself for it to be sent.
        await page.setUserAgent(headers['User-Agent'])
        await page.goto(url)
        return await get_video_list(page)
    finally:
        # Always shut Chromium down, even when navigation or parsing fails.
        await browser.close()


video_list = asyncio.run(main())
print(video_list)
import random import time from collections import deque from typing import List from pyppeteer import launch from bs4 import BeautifulSoup from fake_useragent import UserAgent import asyncio ua = UserAgent() headers = {'User-Agent': ua.random} url = 'https://www.douyin.com/' max_videos = 100 # 每次请求的最大视频数量 buffer_size = max_videos * 2 # 自刷缓冲区大小,可以根据需要调整 buffer = deque(maxlen=buffer_size) # 用双端队列存储缓冲区中的视频链接 async def get_video_list(session): response = await session.get(url, headers=headers) soup = BeautifulSoup(response.text, 'html.parser') video_list = soup.find_all('div', class_='video-item') return video_list[:max_videos] if len(video_list) > max_videos else video_list async def watch_video(session): url = random.choice(buffer).a['href'] if buffer else url + str(random.randint(1000000000, 9999999999)) + '/' + str(random.randint(10000000000, 99999999999)) + '?' + str(random.randint(10000000000, 99999999999)) + '&share_uid=&mid=&sec=&count=6&is_play=1' if buffer else url + '/' + str(random.randint(10000000000, 99999999999)) + '?' 
+ str(random.randint(10000000000, 99999999999)) + '&share_uid=&mid=&sec=&count=6&is_play=1' iframe = await session.evaluate('() => document.querySelector("iframe")') res = await session.evaluate('() => new Promise((resolve) => {' + f'''let contentWindow = ({iframe}).contentWindow; let frameDocument = ({iframe}).contentDocument; let playerUrl = "{url}{{}}"; contentWindow.postMessage(playerUrl+"", "*"); resolve(); ''' + '}'))['value'] if buffer else None res = await session.evaluate('() => new Promise((resolve) => {{let contentWindow = ({iframe}).contentWindow; let frameDocument = ({iframe}).contentDocument; let playerUrl = "{url}{{}}"; contentWindow.postMessage(playerUrl+"", "*"); resolve();}}')['value'] if buffer else None res = res['data'] if buffer and res and isinstance(res, dict) and 'data' in res else res if buffer and res and isinstance(res, dict) and 'data' in res else None if buffer and res and isinstance(res, dict) and 'data' in res else None if buffer and res and isinstance(res, dict) and 'data' in res else None if buffer and res and isinstance(res, dict) and 'data' in res else None if buffer and res and isinstance(res, dict) and 'data' in res else None if buffer and res and isinstance(res, dict) and 'data' in res else None if buffer and res and isinstance(res, dict) && 'data' in res else None if buffer and res and isinstance(res, dict) && 'data' in res else None if buffer and res and isinstance(res, dict) && 'data' in res else None if buffer and res and isinstance(res, dict) && 'data' in res else None if buffer and res and isinstance(res, dict) && 'data' in res else None if buffer and res and isinstance(res, dict) && 'data' in res else None if buffer and res and isinstance(res, dict) && 'data' in res else None if buffer and res and isinstance(res, dict) && 'data' in res else None if buffer and res and isinstance(res, dict) && 'data' in res else None if buffer and res and isinstance(res, dict) && 'data' in res else None if buffer and res and 
isinstance(res, dict) && 'data' in res else None if buffer and res and isinstance(res, dict) && 'data' in res else None if buffer and res and isinstance(res, dict) && 'data' in res else None if buffer and res and isinstance(res, dict) && 'data' in res else None if buffer and res and isinstance(res, dict) && 'data' in res else None if buffer and res and isinstance(res, dict) && 'data' in res else None if buffer and res and isinstance(res, dict) && 'data' in res else None if buffer and res and isinstance(res, dict) && 'data' in res else '' if not buffer or not res or not isinstance(res, dict) or not 'data' in res or not isinstance(res['data'], str) or not len(res['data']) or not len(res['data'].split('\t')) or not len([i for i in [j for j in [k for k in [l for l in [m for m in [n for n in [o for o in [p for p in [q for q in [r for r in [s for s in [t for t in [u for u in [[v for v in [w for w in [[x for x in [y for y in [[z for z in ['split(' + '"|".join(['"{}"'.format(i) for i in ['title','src','thumbnail']])) + '"')]