|
@@ -0,0 +1,692 @@
|
|
|
|
|
+先说结论:
|
|
|
|
|
+用 Web Scraper API(Pinterest 套件)可以比较接近你的 3 步工作流,但它是“预置爬虫”,不能像自己写代码那样一步步“点击页面元素”。
|
|
|
|
|
+支持的能力大致是:
|
|
|
|
|
+
|
|
|
|
|
+通过关键词发现博主(Profiles by keyword)
|
|
|
|
|
+通过博主 URL 拉取该博主的帖子列表(Posts by profile URL)
|
|
|
|
|
+通过帖子 URL 拉取该帖的详情和评论(含一个层级的评论数组 `comments`,目前文档里没有“继续展开更多回复”的二次交互能力)
|
|
|
|
|
+也就是说:
|
|
|
|
|
+
|
|
|
|
|
+“平铺式评论列表”可以拿到(`comments` 字段);
|
|
|
|
|
+“点击查看更多回复再抓子回复”这种交互,Web Scraper API 目前不支持你自定义脚本去点按钮,只能拿到爬虫内置能抓到的评论层级。
|
|
|
|
|
+下面我按你的 3 步工作流,用 Node.js + Web Scraper API 给你完整示例(包括请求 & 典型 JSON 结构),你只需要换成你自己的 API key 和具体 dataset_id 即可运行。
|
|
|
|
|
+
|
|
|
|
|
+通用前提
|
|
|
|
|
+1. 安装依赖
|
|
|
|
|
+Bash
|
|
|
|
|
+
|
|
|
|
|
+npm install axios
|
|
|
|
|
+2. 认证方式
|
|
|
|
|
+在账号设置里生成 API key(我们叫 API key,不是 token):
|
|
|
|
|
+文档:How to get a API Key
|
|
|
|
|
+
|
|
|
|
|
+JS
|
|
|
|
|
+
|
|
|
|
|
+const axios = require('axios');
|
|
|
|
|
+
|
|
|
|
|
+const API_KEY = 'YOUR_API_KEY'; // 换成你的
|
|
|
|
|
+const CUSTOMER_ID = 'hl_3202e698'; // 你的 account id
|
|
|
|
|
+所有 Web Scraper API 请求都走这个触发端点(异步):
|
|
|
|
|
+
|
|
|
|
|
+触发采集:POST https://api.brightdata.com/datasets/v3/trigger?dataset_id=DATASET_ID
|
|
|
|
|
+查询进度:GET https://api.brightdata.com/datasets/v3/progress/{snapshot_id};下载结果:GET https://api.brightdata.com/datasets/v3/snapshot/{snapshot_id}
|
|
|
|
|
+文档(英文):
|
|
|
|
|
+
|
|
|
|
|
+Web Scraper API 快速开始:Web Scraper API 快速入门
|
|
|
|
|
+Pinterest 抓取器说明:Pinterest API 抓取器
|
|
|
|
|
+步骤 1:按关键词 + 地区发现 Pinterest 博主
|
|
|
|
|
+使用:Pinterest Profiles API – 通过关键词发现
|
|
|
|
|
+
|
|
|
|
|
+在界面里路径大致是:
|
|
|
|
|
+Data → Web Scraper API → Scraper Library → 搜索 “Pinterest” → 选择 “Profiles API” → 选择 “Discover by Keywords”
|
|
|
|
|
+在这个页面右上角可以看到对应的 `dataset_id`(形如 `gd_lk0zv93c2m9qdph46z/keyword`,真正调用时只用主 ID 部分,比如 `gd_lk0zv93c2m9qdph46z`,地区通常作为输入字段)。
|
|
|
|
|
+
|
|
|
|
|
+Node.js 调用示例(发现博主)
|
|
|
|
|
+JS
|
|
|
|
|
+
|
|
|
|
|
+const axios = require('axios');
|
|
|
|
|
+
|
|
|
|
|
+const API_KEY = 'YOUR_API_KEY';
|
|
|
|
|
+const DATASET_ID_PROFILES_KEYWORD = 'YOUR_PROFILES_DATASET_ID'; // 比如 gd_lk0zv93c2m9qdph46z
|
|
|
|
|
+
|
|
|
|
|
/**
 * Discovers Pinterest creators for a keyword through the Bright Data
 * Web Scraper API.
 *
 * The API is asynchronous: this function triggers a collection job, polls the
 * job's progress until it reports `ready`, and then downloads the snapshot as
 * JSON.
 *
 * @param {string} [keyword='fashion'] - Search keyword sent as the `keyword` input field.
 * @param {string} [country='US'] - Region sent as the `country` input field.
 * @returns {Promise<Array<object>>} The creator records contained in the snapshot.
 * @throws {TypeError} If `keyword` is not a non-empty string.
 * @throws {Error} If the job fails, does not become ready in time, the
 *   response has an unexpected shape, or an HTTP request fails.
 */
async function discoverCreatorsByKeyword(keyword = 'fashion', country = 'US') {
  if (typeof keyword !== 'string' || keyword.trim() === '') {
    throw new TypeError('keyword must be a non-empty string');
  }

  const apiBase = 'https://api.brightdata.com/datasets/v3';
  const pollIntervalMs = 5000;
  // Bound the wait (~10 minutes) so a stuck job cannot block the caller forever.
  const maxPolls = 120;
  const authHeaders = { Authorization: `Bearer ${API_KEY}` };

  // Field names must match the "Inputs" panel of the scraper in the Scraper
  // Library; some scrapers name the region field `country_code` instead.
  const body = [{ keyword, country }];

  try {
    const triggerRes = await axios.post(
      `${apiBase}/trigger?dataset_id=${encodeURIComponent(DATASET_ID_PROFILES_KEYWORD)}`,
      body,
      { headers: { ...authHeaders, 'Content-Type': 'application/json' } },
    );

    const snapshotId = triggerRes.data?.snapshot_id;
    if (!snapshotId) {
      throw new Error('Trigger response did not contain a snapshot_id');
    }
    console.log('Profiles job triggered, snapshot_id:', snapshotId);

    // Poll the progress endpoint: it always answers with a status object,
    // whereas the snapshot endpoint returns the data itself once it is ready.
    let isReady = false;
    for (let attempt = 0; attempt < maxPolls; attempt += 1) {
      const statusRes = await axios.get(`${apiBase}/progress/${snapshotId}`, {
        headers: authHeaders,
      });
      const status = statusRes.data?.status;
      console.log('Status:', status);

      if (status === 'ready') {
        isReady = true;
        break;
      }
      if (status === 'failed') {
        throw new Error('Profiles collection failed');
      }
      await new Promise((resolve) => setTimeout(resolve, pollIntervalMs));
    }
    if (!isReady) {
      throw new Error('Profiles collection did not become ready in time');
    }

    const downloadRes = await axios.get(
      `${apiBase}/snapshot/${snapshotId}?format=json`,
      { headers: authHeaders },
    );

    const creators = downloadRes.data;
    if (!Array.isArray(creators)) {
      throw new Error('Unexpected snapshot payload: expected a JSON array');
    }
    console.log('Creators count:', creators.length);
    if (creators.length > 0) {
      console.dir(creators[0], { depth: null });
    }

    return creators;
  } catch (err) {
    console.error('Error discovering creators:', err.response?.data || err.message);
    throw err;
  }
}
|
|
|
|
|
+
|
|
|
|
|
// discoverCreatorsByKeyword already logs the failure and rethrows; handle the
// rejection here so it is not left unhandled, and signal failure to the shell.
discoverCreatorsByKeyword().catch(() => {
  process.exitCode = 1;
});
|
|
|
|
|
+典型返回 JSON(单条博主数据)
|
|
|
|
|
+字段以文档为准(Profiles – 通过关键词发现):
|
|
|
|
|
+Profiles API 文档
|
|
|
|
|
+
|
|
|
|
|
+大致结构类似:
|
|
|
|
|
+
|
|
|
|
|
+JSON
|
|
|
|
|
+
|
|
|
|
|
+{
|
|
|
|
|
+ "url": "https://www.pinterest.com/creator_username/",
|
|
|
|
|
+ "profile_picture": "https://i.pinimg.com/...",
|
|
|
|
|
+ "name": "Creator Name",
|
|
|
|
|
+ "nickname": "creator_username",
|
|
|
|
|
+ "website": "https://creator-site.com",
|
|
|
|
|
+ "bio": "Short bio text",
|
|
|
|
|
+ "country_code": "US",
|
|
|
|
|
+ "profile_id": "1234567890",
|
|
|
|
|
+ "following_count": 120,
|
|
|
|
|
+ "follower_count": 54000,
|
|
|
|
|
+ "boards_num": 12,
|
|
|
|
|
+ "saved": 3400,
|
|
|
|
|
+ "posts_page_url": "https://www.pinterest.com/creator_username/_created/",
|
|
|
|
|
+ "last_updated": "2026-02-14T03:00:00Z",
|
|
|
|
|
+ "discovery_input": {
|
|
|
|
|
+ "keyword": "fashion",
|
|
|
|
|
+ "country": "US"
|
|
|
|
|
+ }
|
|
|
|
|
+}
|
|
|
|
|
+你在前端“点击博主”时,只需要把 `url` 或 `posts_page_url` 传给下一步即可。
|
|
|
|
|
+
|
|
|
|
|
+步骤 2:点击博主 → 拉取该博主的视频 / 帖子列表
|
|
|
|
|
+使用:Pinterest Posts API – 通过个人资料 URL 发现
|
|
|
|
|
+
|
|
|
|
|
+在 Scraper Library 里选择 Pinterest → Posts API →“Discover by Profile URL”。
|
|
|
|
|
+
|
|
|
|
|
+Node.js 调用示例(按博主 URL 拉取帖子)
|
|
|
|
|
+JS
|
|
|
|
|
+
|
|
|
|
|
+const axios = require('axios');
|
|
|
|
|
+
|
|
|
|
|
+const API_KEY = 'YOUR_API_KEY';
|
|
|
|
|
+const DATASET_ID_POSTS_BY_PROFILE = 'YOUR_POSTS_PROFILE_DATASET_ID'; // 比如 gd_lk0sjs4d21kdr7cnlv
|
|
|
|
|
+
|
|
|
|
|
/**
 * Fetches the posts of a Pinterest creator through the Bright Data
 * Web Scraper API ("Discover by Profile URL").
 *
 * Triggers an asynchronous collection job, polls the job's progress until it
 * reports `ready`, and then downloads the snapshot as JSON.
 *
 * @param {string} profileUrl - Creator profile URL, e.g.
 *   https://www.pinterest.com/creator_username/
 * @param {number} [numOfPosts=50] - Maximum number of posts to collect
 *   (sent as the `num_of_posts` input field).
 * @returns {Promise<Array<object>>} The post records contained in the snapshot.
 * @throws {TypeError} If `profileUrl` is not a non-empty string.
 * @throws {Error} If the job fails, does not become ready in time, the
 *   response has an unexpected shape, or an HTTP request fails.
 */
async function getPostsByCreatorProfile(profileUrl, numOfPosts = 50) {
  if (typeof profileUrl !== 'string' || profileUrl.trim() === '') {
    throw new TypeError('profileUrl must be a non-empty string');
  }

  const apiBase = 'https://api.brightdata.com/datasets/v3';
  const pollIntervalMs = 5000;
  // Bound the wait (~10 minutes) so a stuck job cannot block the caller forever.
  const maxPolls = 120;
  const authHeaders = { Authorization: `Bearer ${API_KEY}` };

  const body = [
    {
      // Field names are case-sensitive and must match the scraper's "Inputs"
      // panel in the Scraper Library; the docs use `URL`.
      URL: profileUrl,
      num_of_posts: numOfPosts,
      // Other optional inputs: start_date / end_date (e.g. '01-01-2025') and
      // posts_to_not_include (array of post ids).
    },
  ];

  try {
    const triggerRes = await axios.post(
      `${apiBase}/trigger?dataset_id=${encodeURIComponent(DATASET_ID_POSTS_BY_PROFILE)}`,
      body,
      { headers: { ...authHeaders, 'Content-Type': 'application/json' } },
    );

    const snapshotId = triggerRes.data?.snapshot_id;
    if (!snapshotId) {
      throw new Error('Trigger response did not contain a snapshot_id');
    }
    console.log('Posts job triggered, snapshot_id:', snapshotId);

    // Poll the progress endpoint: it always answers with a status object,
    // whereas the snapshot endpoint returns the data itself once it is ready.
    let isReady = false;
    for (let attempt = 0; attempt < maxPolls; attempt += 1) {
      const statusRes = await axios.get(`${apiBase}/progress/${snapshotId}`, {
        headers: authHeaders,
      });
      const status = statusRes.data?.status;
      console.log('Status:', status);

      if (status === 'ready') {
        isReady = true;
        break;
      }
      if (status === 'failed') {
        throw new Error('Posts collection failed');
      }
      await new Promise((resolve) => setTimeout(resolve, pollIntervalMs));
    }
    if (!isReady) {
      throw new Error('Posts collection did not become ready in time');
    }

    const downloadRes = await axios.get(
      `${apiBase}/snapshot/${snapshotId}?format=json`,
      { headers: authHeaders },
    );

    const posts = downloadRes.data;
    if (!Array.isArray(posts)) {
      throw new Error('Unexpected snapshot payload: expected a JSON array');
    }
    console.log('Posts count:', posts.length);
    if (posts.length > 0) {
      console.dir(posts[0], { depth: null });
    }

    return posts;
  } catch (err) {
    console.error('Error getting posts:', err.response?.data || err.message);
    throw err;
  }
}
|
|
|
|
|
+
|
|
|
|
|
+// 示例:从上一步返回的 creators[0].url 传入
|
|
|
|
|
+// getPostsByCreatorProfile('https://www.pinterest.com/creator_username/');
|
|
|
|
|
+典型返回 JSON(单条帖子 / 视频数据)
|
|
|
|
|
+参考文档中 Posts – 通过个人资料 URL 发现(Discover by Profile URL)的输出结构:
|
|
|
|
|
+Posts API 文档
|
|
|
|
|
+
|
|
|
|
|
+大致类似:
|
|
|
|
|
+
|
|
|
|
|
+JSON
|
|
|
|
|
+
|
|
|
|
|
+{
|
|
|
|
|
+ "url": "https://www.pinterest.com/pin/1234567890/",
|
|
|
|
|
+ "post_id": "1234567890",
|
|
|
|
|
+ "title": "Outfit ideas for spring",
|
|
|
|
|
+ "content": "Some description text...",
|
|
|
|
|
+ "date_posted": "2026-02-10T12:34:56Z",
|
|
|
|
|
+ "post_type": "video",
|
|
|
|
|
+ "user_name": "Creator Name",
|
|
|
|
|
+ "user_url": "https://www.pinterest.com/creator_username/",
|
|
|
|
|
+ "user_id": "987654321",
|
|
|
|
|
+ "followers": 54000,
|
|
|
|
|
+ "likes": 1200,
|
|
|
|
|
+ "comments_num": 85,
|
|
|
|
|
+ "comments": [
|
|
|
|
|
+ {
|
|
|
|
|
+ "comment_id": "c1",
|
|
|
|
|
+ "author_name": "User A",
|
|
|
|
|
+ "author_url": "https://www.pinterest.com/user_a/",
|
|
|
|
|
+ "text": "Love this!",
|
|
|
|
|
+ "date": "2026-02-11T08:00:00Z",
|
|
|
|
|
+ "likes": 10
|
|
|
|
|
+ // 有的版本里可能还有 replies_count 等字段,具体以实际 schema 为准
|
|
|
|
|
+ }
|
|
|
|
|
+ ],
|
|
|
|
|
+ "categories": ["Fashion", "Outfits"],
|
|
|
|
|
+ "image_video_url": "https://v.pinimg.com/videos/...",
|
|
|
|
|
+ "video_length": 23,
|
|
|
|
|
+ "attached_files": [],
|
|
|
|
|
+ "hashtags": ["#fashion", "#springoutfit"],
|
|
|
|
|
+ "source": "pinterest",
|
|
|
|
|
+ "discovery_input": {
|
|
|
|
|
+ "URL": "https://www.pinterest.com/creator_username/",
|
|
|
|
|
+ "num_of_posts": 50
|
|
|
|
|
+ }
|
|
|
|
|
+}
|
|
|
|
|
+你在前端“点击视频”时,只需要把 `url` 或 `post_id` 传给下一步。
|
|
|
|
|
+
|
|
|
|
|
+步骤 3:点击视频 → 获取评论列表(及“查看更多回复”)
|
|
|
|
|
+这里有一个关键限制:
|
|
|
|
|
+
|
|
|
|
|
+Web Scraper API 的 Pinterest Posts API 在“Collect by URL”模式下,会返回该帖的详情和 `comments` 数组;
|
|
|
|
|
+但它不会像浏览器自动化那样帮你“点击查看更多回复”多次去展开所有子回复;
|
|
|
|
|
+文档里只明确了 `comments` 和 `comments_num` 等字段,没有单独的“replies”数组说明,是否包含子回复、包含到什么层级,取决于当前爬虫实现,不能通过 API 参数控制。
|
|
|
|
|
+如果你只需要“平铺式评论列表”(不强制区分主评论 / 子回复),可以直接用 `comments` 字段;
|
|
|
|
|
+如果你必须精确控制“点击查看更多回复”的行为,那就需要用 Scraper Studio 自定义脚本或 Scraping Browser,而不是 Web Scraper API 现成 Pinterest 套件。
|
|
|
|
|
+
|
|
|
|
|
+Node.js 调用示例(按帖子 URL 拉取评论)
|
|
|
|
|
+使用:Pinterest Posts API – Collect by URL(和上一步是同一个 Posts API,只是换成 “Collect by URL” 这个端点,对应的 dataset_id 可能是同一个,也可能是另一个,取决于库里配置;以 Scraper Library 页面为准)。
|
|
|
|
|
+
|
|
|
|
|
+JS
|
|
|
|
|
+
|
|
|
|
|
+const axios = require('axios');
|
|
|
|
|
+
|
|
|
|
|
+const API_KEY = 'YOUR_API_KEY';
|
|
|
|
|
+const DATASET_ID_POST_BY_URL = 'YOUR_POST_BY_URL_DATASET_ID'; // 比如 gd_lk0sjs4d21kdr7cnlv
|
|
|
|
|
+
|
|
|
|
|
/**
 * Fetches a single Pinterest post together with its comments through the
 * Bright Data Web Scraper API ("Collect by URL").
 *
 * Triggers an asynchronous collection job, polls the job's progress until it
 * reports `ready`, downloads the snapshot as JSON, and flattens the post's
 * `comments` array into a compact list.
 *
 * @param {string} postUrl - Post URL, e.g. https://www.pinterest.com/pin/1234567890/
 * @returns {Promise<{post: object, flatComments: Array<object>}>} The raw post
 *   record and its comments reduced to
 *   `{ comment_id, author_name, text, date, likes }`.
 * @throws {TypeError} If `postUrl` is not a non-empty string.
 * @throws {Error} If the job fails, does not become ready in time, returns no
 *   record, or an HTTP request fails.
 */
async function getPostWithComments(postUrl) {
  if (typeof postUrl !== 'string' || postUrl.trim() === '') {
    throw new TypeError('postUrl must be a non-empty string');
  }

  const apiBase = 'https://api.brightdata.com/datasets/v3';
  const pollIntervalMs = 5000;
  // Bound the wait (~10 minutes) so a stuck job cannot block the caller forever.
  const maxPolls = 120;
  const authHeaders = { Authorization: `Bearer ${API_KEY}` };

  const body = [{ URL: postUrl }];

  try {
    const triggerRes = await axios.post(
      `${apiBase}/trigger?dataset_id=${encodeURIComponent(DATASET_ID_POST_BY_URL)}`,
      body,
      { headers: { ...authHeaders, 'Content-Type': 'application/json' } },
    );

    const snapshotId = triggerRes.data?.snapshot_id;
    if (!snapshotId) {
      throw new Error('Trigger response did not contain a snapshot_id');
    }
    console.log('Post-by-URL job triggered, snapshot_id:', snapshotId);

    // Poll the progress endpoint: it always answers with a status object,
    // whereas the snapshot endpoint returns the data itself once it is ready.
    let isReady = false;
    for (let attempt = 0; attempt < maxPolls; attempt += 1) {
      const statusRes = await axios.get(`${apiBase}/progress/${snapshotId}`, {
        headers: authHeaders,
      });
      const status = statusRes.data?.status;
      console.log('Status:', status);

      if (status === 'ready') {
        isReady = true;
        break;
      }
      if (status === 'failed') {
        throw new Error('Post-by-URL collection failed');
      }
      await new Promise((resolve) => setTimeout(resolve, pollIntervalMs));
    }
    if (!isReady) {
      throw new Error('Post-by-URL collection did not become ready in time');
    }

    const downloadRes = await axios.get(
      `${apiBase}/snapshot/${snapshotId}?format=json`,
      { headers: authHeaders },
    );

    // A single input URL is expected to yield a single record.
    const records = Array.isArray(downloadRes.data) ? downloadRes.data : [];
    const [post] = records;
    if (!post) {
      throw new Error('Post-by-URL collection returned no records');
    }
    console.dir(post, { depth: null });

    // Flatten the comments into the structure the caller needs. If the schema
    // exposes `replies` or `replies_count`, expand or flag them here.
    const comments = Array.isArray(post.comments) ? post.comments : [];
    const flatComments = comments.map(
      ({ comment_id, author_name, text, date, likes }) => ({
        comment_id,
        author_name,
        text,
        date,
        likes,
      }),
    );

    console.log('Flat comments count:', flatComments.length);
    return { post, flatComments };
  } catch (err) {
    console.error('Error getting post comments:', err.response?.data || err.message);
    throw err;
  }
}
|
|
|
|
|
+
|
|
|
|
|
+// 示例:从上一步 posts[0].url 传入
|
|
|
|
|
+// getPostWithComments('https://www.pinterest.com/pin/1234567890/');
|
|
|
|
|
+典型返回 JSON(含评论)
|
|
|
|
|
+和上一步类似,只是你通常只传一个 URL,返回一个对象:
|
|
|
|
|
+
|
|
|
|
|
+JSON
|
|
|
|
|
+
|
|
|
|
|
+{
|
|
|
|
|
+ "url": "https://www.pinterest.com/pin/1234567890/",
|
|
|
|
|
+ "post_id": "1234567890",
|
|
|
|
|
+ "title": "Outfit ideas for spring",
|
|
|
|
|
+ "comments_num": 85,
|
|
|
|
|
+ "comments": [
|
|
|
|
|
+ {
|
|
|
|
|
+ "comment_id": "c1",
|
|
|
|
|
+ "author_name": "User A",
|
|
|
|
|
+ "author_url": "https://www.pinterest.com/user_a/",
|
|
|
|
|
+ "text": "Love this!",
|
|
|
|
|
+ "date": "2026-02-11T08:00:00Z",
|
|
|
|
|
+ "likes": 10
|
|
|
|
|
+ // 这里是否有 replies / replies_count,要以实际 schema 为准
|
|
|
|
|
+ },
|
|
|
|
|
+ {
|
|
|
|
|
+ "comment_id": "c2",
|
|
|
|
|
+ "author_name": "User B",
|
|
|
|
|
+ "text": "Where did you buy this?",
|
|
|
|
|
+ "date": "2026-02-11T09:00:00Z",
|
|
|
|
|
+ "likes": 3
|
|
|
|
|
+ }
|
|
|
|
|
+ ],
|
|
|
|
|
+ "image_video_url": "https://v.pinimg.com/videos/...",
|
|
|
|
|
+ "video_length": 23,
|
|
|
|
|
+ "hashtags": ["#fashion", "#springoutfit"],
|
|
|
|
|
+ "discovery_input": {
|
|
|
|
|
+ "URL": "https://www.pinterest.com/pin/1234567890/"
|
|
|
|
|
+ }
|
|
|
|
|
+}
|
|
|
|
|
+关于“查看更多回复”的现实情况
|
|
|
|
|
+Web Scraper API(Pinterest 套件)是“无代码预置爬虫”,你只能通过输入参数控制采集范围,不能写脚本去点“查看更多回复”按钮;
|
|
|
|
|
+如果你业务上必须精确抓到每条评论的所有子回复,有两种更灵活的方案:
|
|
|
|
|
+Scraper Studio 自定义 Pinterest 爬虫:自己写(或用 AI 生成)脚本,显式执行点击、滚动、等待等操作;
|
|
|
|
|
+Scraping Browser + 你自己的 Node.js 逻辑:用 CDP / Selenium 控制浏览器,完全模拟人工操作。
|
|
|
|
|
+这两种都可以做到你说的“点击评论下方查看更多回复再抓子评论”,但就不再是 Web Scraper API 现成 Pinterest 套件了。
|
|
|
|
|
+
|
|
|
|
|
+帮你把 3 步串成一个 Node.js 工作流(简化版)
|
|
|
|
|
+下面是一个“从关键词 → 选博主 → 选帖子 → 拉评论”的串联示例,方便你对接现有系统:
|
|
|
|
|
+
|
|
|
|
|
+JS
|
|
|
|
|
+
|
|
|
|
|
+const axios = require('axios');
|
|
|
|
|
+
|
|
|
|
|
+const API_KEY = 'YOUR_API_KEY';
|
|
|
|
|
+const DATASET_ID_PROFILES_KEYWORD = 'YOUR_PROFILES_DATASET_ID';
|
|
|
|
|
+const DATASET_ID_POSTS_BY_PROFILE = 'YOUR_POSTS_PROFILE_DATASET_ID';
|
|
|
|
|
+const DATASET_ID_POST_BY_URL = 'YOUR_POST_BY_URL_DATASET_ID';
|
|
|
|
|
+
|
|
|
|
|
/**
 * Triggers a Bright Data Web Scraper API job and resolves with its results.
 *
 * Posts the inputs to the trigger endpoint, polls the job's progress until it
 * reports `ready`, then downloads the snapshot as JSON.
 *
 * @param {string} datasetId - Dataset (scraper) id, e.g. `gd_...`.
 * @param {Array<object>|object} body - JSON request body with the scraper inputs.
 * @param {object} [options]
 * @param {number} [options.pollIntervalMs=5000] - Delay between progress checks.
 * @param {number} [options.maxPolls=120] - Maximum number of progress checks
 *   before giving up.
 * @returns {Promise<*>} The parsed snapshot payload as returned by the download
 *   endpoint (callers in this file treat it as an array of records).
 * @throws {TypeError} If `datasetId` is not a non-empty string or `body` is nullish.
 * @throws {Error} If the job fails, does not become ready in time, the trigger
 *   response lacks a `snapshot_id`, or an HTTP request fails.
 */
async function triggerAndWait(
  datasetId,
  body,
  { pollIntervalMs = 5000, maxPolls = 120 } = {},
) {
  if (typeof datasetId !== 'string' || datasetId === '') {
    throw new TypeError('datasetId must be a non-empty string');
  }
  if (body == null) {
    throw new TypeError('body must contain the scraper inputs');
  }

  const apiBase = 'https://api.brightdata.com/datasets/v3';
  const authHeaders = { Authorization: `Bearer ${API_KEY}` };

  const triggerRes = await axios.post(
    `${apiBase}/trigger?dataset_id=${encodeURIComponent(datasetId)}`,
    body,
    { headers: { ...authHeaders, 'Content-Type': 'application/json' } },
  );
  const snapshotId = triggerRes.data?.snapshot_id;
  if (!snapshotId) {
    throw new Error('Trigger response did not contain a snapshot_id');
  }

  // Poll the progress endpoint: it always answers with a status object,
  // whereas the snapshot endpoint returns the data itself once it is ready.
  let isReady = false;
  for (let attempt = 0; attempt < maxPolls; attempt += 1) {
    const statusRes = await axios.get(`${apiBase}/progress/${snapshotId}`, {
      headers: authHeaders,
    });
    const status = statusRes.data?.status;

    if (status === 'ready') {
      isReady = true;
      break;
    }
    if (status === 'failed') {
      throw new Error('Job failed');
    }
    await new Promise((resolve) => setTimeout(resolve, pollIntervalMs));
  }
  if (!isReady) {
    throw new Error(`Job ${snapshotId} did not become ready in time`);
  }

  const downloadRes = await axios.get(
    `${apiBase}/snapshot/${snapshotId}?format=json`,
    { headers: authHeaders },
  );
  return downloadRes.data;
}
|
|
|
|
|
+
|
|
|
|
|
/**
 * Runs the three-step Pinterest workflow end to end:
 * keyword -> first creator -> that creator's first post -> the post's comments.
 *
 * Each step is one `triggerAndWait` job; the first record of a step's result
 * supplies the input for the next step.
 *
 * @returns {Promise<void>}
 * @throws {Error} If any step returns no usable record, or if a job fails.
 */
async function mainWorkflow() {
  // 1) Discover creators by keyword + region.
  const creators = await triggerAndWait(DATASET_ID_PROFILES_KEYWORD, [
    { keyword: 'fashion', country: 'US' },
  ]);
  const creator = Array.isArray(creators) ? creators[0] : undefined;
  if (!creator?.url) {
    throw new Error('Creator discovery returned no creator with a url');
  }
  console.log('Picked creator:', creator.url);

  // 2) List the creator's posts from the profile URL.
  const posts = await triggerAndWait(DATASET_ID_POSTS_BY_PROFILE, [
    { URL: creator.url, num_of_posts: 20 },
  ]);
  const post = Array.isArray(posts) ? posts[0] : undefined;
  if (!post?.url) {
    throw new Error('Post discovery returned no post with a url');
  }
  console.log('Picked post:', post.url);

  // 3) Fetch the post details (including comments) from the post URL.
  const postRecords = await triggerAndWait(DATASET_ID_POST_BY_URL, [
    { URL: post.url },
  ]);
  const postWithComments = Array.isArray(postRecords) ? postRecords[0] : undefined;
  if (!postWithComments) {
    throw new Error('Post-by-URL collection returned no records');
  }

  console.log('Comments_num:', postWithComments.comments_num);
  console.log('First comment:', postWithComments.comments?.[0]);
}
|
|
|
|
|
+
|
|
|
|
|
+mainWorkflow().catch(console.error);
|
|
|
|
|
+接下来可以怎么做?
|
|
|
|
|
+你可以先在 Scraper Library 里找到这 3 个 Pinterest endpoint,对照界面确认:
|
|
|
|
|
+输入字段名(`keyword` / `country` / `URL` / `num_of_posts` 等)
|
|
|
|
|
+输出字段(特别是 `comments` 里是否有 `replies` 或 `replies_count`)
|
|
|
|
|
+把上面的 Node.js 示例里 `YOUR_API_KEY` 和各自的 `DATASET_ID` 换成你自己的,跑一遍看看真实返回结构。
|
|
|
|
|
+如果你愿意,可以把你在 Scraper Library 里看到的:
|
|
|
|
|
+
|
|
|
|
|
+具体的 dataset_id(或截图里的 endpoint 名称)
|
|
|
|
|
+你期望的“平铺评论 + 回复”的最终 JSON 结构
|
|
|
|
|
+发给我,我可以帮你把 Node.js 代码改成完全贴合你目标 JSON 的版本(包括如何把 `comments` 转成你要的扁平结构)。
|
|
|
|
|
+
|
|
|
|
|
+如果你确实需要“强制展开所有回复”,我也可以帮你评估是用 Scraper Studio 还是 Scraping Browser 更合适。
|
|
|
|
|
+
|
|
|
|
|
+最后,如果这次说明对你有帮助,欢迎用下方的拇指按钮给个反馈。
|
|
|
|
|
+
|
|
|
|
|
+不能直接走get/post请求的方式吗
|
|
|
|
|
+
|
|
|
|
|
+可以的,你现在看到的所有示例,本质上就是“纯 HTTP POST/GET 请求”,不需要任何 SDK,只是我用 Node.js 的 axios 来发请求而已。
|
|
|
|
|
+Web Scraper API 的调用方式就是标准的 REST API:
|
|
|
|
|
+
|
|
|
|
|
+触发采集:POST https://api.brightdata.com/datasets/v3/trigger?dataset_id=...
|
|
|
|
|
+查询 / 下载结果:GET https://api.brightdata.com/datasets/v3/snapshot/{snapshot_id}
|
|
|
|
|
+下面我给你一个完全最小化的 Node.js 示例,只用原生 `https.request`,你可以看到它就是标准的 POST / GET。
|
|
|
|
|
+
|
|
|
|
|
+1)POST 触发采集(发现博主 / 拉帖子 / 拉评论都是同一个模式)
|
|
|
|
|
+以“按关键词发现博主”为例(Profiles by keyword):
|
|
|
|
|
+
|
|
|
|
|
+JS
|
|
|
|
|
+
|
|
|
|
|
+const https = require('https');
|
|
|
|
|
+
|
|
|
|
|
+const API_KEY = 'YOUR_API_KEY';
|
|
|
|
|
+const DATASET_ID = 'YOUR_PROFILES_DATASET_ID'; // 比如 gd_lk0zv93c2m9qdph46z
|
|
|
|
|
+
|
|
|
|
|
/**
 * Triggers a "Profiles by keyword" collection job with a plain HTTPS POST and,
 * on success, hands the returned snapshot id to `pollSnapshot`.
 *
 * Failures (network error, non-2xx status, non-JSON body, missing
 * `snapshot_id`) are reported with `console.error`; nothing is thrown from the
 * response callbacks, so a bad response cannot crash the process.
 *
 * @returns {void}
 */
function triggerJob() {
  const postData = JSON.stringify([
    {
      keyword: 'fashion',
      country: 'US',
    },
  ]);

  const options = {
    hostname: 'api.brightdata.com',
    path: `/datasets/v3/trigger?dataset_id=${encodeURIComponent(DATASET_ID)}`,
    method: 'POST',
    headers: {
      Authorization: `Bearer ${API_KEY}`,
      'Content-Type': 'application/json',
      'Content-Length': Buffer.byteLength(postData),
    },
  };

  const req = https.request(options, (res) => {
    // Decode as UTF-8 at the stream level so multi-byte characters that
    // straddle chunk boundaries are not corrupted by string concatenation.
    res.setEncoding('utf8');

    let data = '';
    res.on('data', (chunk) => {
      data += chunk;
    });
    res.on('end', () => {
      console.log('Trigger response:', data);

      if (res.statusCode < 200 || res.statusCode >= 300) {
        console.error('Request error: HTTP', res.statusCode);
        return;
      }

      let json;
      try {
        json = JSON.parse(data);
      } catch (parseErr) {
        console.error('Request error: response is not valid JSON:', parseErr.message);
        return;
      }

      const snapshotId = json?.snapshot_id;
      if (!snapshotId) {
        console.error('Request error: response did not contain a snapshot_id');
        return;
      }
      console.log('snapshot_id:', snapshotId);

      // Once the job is triggered, fetch the results with GET.
      pollSnapshot(snapshotId);
    });
  });

  req.on('error', (e) => {
    console.error('Request error:', e);
  });

  req.write(postData);
  req.end();
}
|
|
|
|
|
+
|
|
|
|
|
/**
 * Downloads a snapshot with a plain HTTPS GET, retrying while the job is
 * still running.
 *
 * The download endpoint is expected to answer HTTP 202 while the snapshot is
 * not ready yet; in that case the request is repeated after a delay, up to a
 * fixed number of attempts. The final payload is printed with `console.log`.
 * Failures are reported with `console.error`; nothing is thrown.
 *
 * @param {string} snapshotId - Snapshot id returned by the trigger request.
 * @param {number} [attempt=0] - Zero-based attempt counter used for retries;
 *   callers normally omit it.
 * @returns {void}
 */
function pollSnapshot(snapshotId, attempt = 0) {
  const retryDelayMs = 5000;
  // Bound the wait (~10 minutes) so a stuck job cannot keep the process polling forever.
  const maxAttempts = 120;

  if (typeof snapshotId !== 'string' || snapshotId === '') {
    console.error('Snapshot error: missing snapshot_id, nothing to fetch');
    return;
  }

  const options = {
    hostname: 'api.brightdata.com',
    path: `/datasets/v3/snapshot/${encodeURIComponent(snapshotId)}?format=json`,
    method: 'GET',
    headers: {
      Authorization: `Bearer ${API_KEY}`,
    },
  };

  const req = https.request(options, (res) => {
    // Decode as UTF-8 at the stream level so multi-byte characters that
    // straddle chunk boundaries are not corrupted by string concatenation.
    res.setEncoding('utf8');

    let data = '';
    res.on('data', (chunk) => {
      data += chunk;
    });
    res.on('end', () => {
      // 202 means "accepted but not ready": the body is status info, not data.
      if (res.statusCode === 202) {
        if (attempt + 1 >= maxAttempts) {
          console.error('Snapshot error: snapshot did not become ready in time');
          return;
        }
        console.log('Snapshot not ready yet, retrying...');
        setTimeout(() => pollSnapshot(snapshotId, attempt + 1), retryDelayMs);
        return;
      }

      if (res.statusCode < 200 || res.statusCode >= 300) {
        console.error('Snapshot error: HTTP', res.statusCode, data);
        return;
      }

      let json;
      try {
        json = JSON.parse(data);
      } catch (parseErr) {
        console.error('Snapshot error: response is not valid JSON:', parseErr.message);
        return;
      }

      console.log('Snapshot raw response:', JSON.stringify(json, null, 2));
    });
  });

  req.on('error', (e) => {
    console.error('Snapshot error:', e);
  });

  req.end();
}
|
|
|
|
|
+
|
|
|
|
|
+triggerJob();
|
|
|
|
|
+你可以看到:
|
|
|
|
|
+
|
|
|
|
|
+第一步是一个标准的 `POST` 请求,body 是 JSON 数组(输入参数);
|
|
|
|
|
+第二步是一个标准的 `GET` 请求,带 `Authorization` 头;
|
|
|
|
|
+没有用任何 SDK,只是 Node.js 自带的 `https` 模块。
|
|
|
|
|
+你如果愿意,也可以用 `curl`、`fetch` 或任何 HTTP 客户端,本质都是一样的 GET/POST。
|
|
|
|
|
+
|
|
|
|
|
+2)为什么“不能只用一个 GET 一步到位”
|
|
|
|
|
+你现在的需求是:
|
|
|
|
|
+
|
|
|
|
|
+关键词 + 地区 → 找博主
|
|
|
|
|
+点博主 → 拉帖子列表
|
|
|
|
|
+点帖子 → 拉评论(+ 尽量多的回复)
|
|
|
|
|
+Web Scraper API 的设计是异步任务模式:
|
|
|
|
|
+
|
|
|
|
|
+你必须先 `POST /trigger` 创建一个采集任务(这一步告诉系统:我要抓什么、抓多少);
|
|
|
|
|
+系统在后台跑浏览器、过反爬、翻页等;
|
|
|
|
|
+任务完成后,你再用 `GET /snapshot/{id}` 去拿结果。
|
|
|
|
|
+所以从协议层面讲:
|
|
|
|
|
+
|
|
|
|
|
+不能只用一个 GET URL 就完成所有事情,因为你需要先告诉系统“要抓什么”(这一步必须是 POST,带 JSON body);
|
|
|
|
|
+但你完全可以只用“原生 HTTP GET/POST”,不需要任何 SDK 或额外轮询逻辑库——轮询就是你自己写一个 setTimeout 或 while 循环发 GET 而已。
|
|
|
|
|
+3)按你的 3 步工作流,对应的“纯 HTTP”调用方式
|
|
|
|
|
+步骤 1:关键词 + 地区 → 博主列表
|
|
|
|
|
+POST /datasets/v3/trigger?dataset_id=PROFILES_KEYWORD_ID
|
|
|
|
|
+body(JSON)示例:
|
|
|
|
|
+JSON
|
|
|
|
|
+
|
|
|
|
|
+[
|
|
|
|
|
+ {
|
|
|
|
|
+ "keyword": "fashion",
|
|
|
|
|
+ "country": "US"
|
|
|
|
|
+ }
|
|
|
|
|
+]
|
|
|
|
|
+然后 `GET /datasets/v3/snapshot/{snapshot_id}?format=json` 拿结果。
|
|
|
|
|
+返回 JSON(单条示例):
|
|
|
|
|
+
|
|
|
|
|
+JSON
|
|
|
|
|
+
|
|
|
|
|
+{
|
|
|
|
|
+ "url": "https://www.pinterest.com/creator_username/",
|
|
|
|
|
+ "name": "Creator Name",
|
|
|
|
|
+ "nickname": "creator_username",
|
|
|
|
|
+ "country_code": "US",
|
|
|
|
|
+ "profile_id": "1234567890",
|
|
|
|
|
+ "follower_count": 54000,
|
|
|
|
|
+ "posts_page_url": "https://www.pinterest.com/creator_username/_created/",
|
|
|
|
|
+ "discovery_input": {
|
|
|
|
|
+ "keyword": "fashion",
|
|
|
|
|
+ "country": "US"
|
|
|
|
|
+ }
|
|
|
|
|
+}
|
|
|
|
|
+步骤 2:点击博主 → 帖子 / 视频列表
|
|
|
|
|
+POST /datasets/v3/trigger?dataset_id=POSTS_BY_PROFILE_ID
|
|
|
|
|
+body 示例:
|
|
|
|
|
+JSON
|
|
|
|
|
+
|
|
|
|
|
+[
|
|
|
|
|
+ {
|
|
|
|
|
+ "URL": "https://www.pinterest.com/creator_username/",
|
|
|
|
|
+ "num_of_posts": 20
|
|
|
|
|
+ }
|
|
|
|
|
+]
|
|
|
|
|
+然后 `GET /datasets/v3/snapshot/{snapshot_id}?format=json`。
|
|
|
|
|
+返回 JSON(单条帖子):
|
|
|
|
|
+
|
|
|
|
|
+JSON
|
|
|
|
|
+
|
|
|
|
|
+{
|
|
|
|
|
+ "url": "https://www.pinterest.com/pin/1234567890/",
|
|
|
|
|
+ "post_id": "1234567890",
|
|
|
|
|
+ "title": "Outfit ideas for spring",
|
|
|
|
|
+ "post_type": "video",
|
|
|
|
|
+ "image_video_url": "https://v.pinimg.com/videos/...",
|
|
|
|
|
+ "video_length": 23,
|
|
|
|
|
+ "likes": 1200,
|
|
|
|
|
+ "comments_num": 85,
|
|
|
|
|
+ "comments": [ /* 可能有一部分评论 */ ],
|
|
|
|
|
+ "user_name": "Creator Name",
|
|
|
|
|
+ "user_url": "https://www.pinterest.com/creator_username/"
|
|
|
|
|
+}
|
|
|
|
|
+步骤 3:点击视频 → 评论列表(+ 尽量多的回复)
|
|
|
|
|
+POST /datasets/v3/trigger?dataset_id=POST_BY_URL_ID
|
|
|
|
|
+body 示例:
|
|
|
|
|
+JSON
|
|
|
|
|
+
|
|
|
|
|
+[
|
|
|
|
|
+ {
|
|
|
|
|
+ "URL": "https://www.pinterest.com/pin/1234567890/"
|
|
|
|
|
+ }
|
|
|
|
|
+]
|
|
|
|
|
+然后 `GET /datasets/v3/snapshot/{snapshot_id}?format=json`。
|
|
|
|
|
+返回 JSON(含评论):
|
|
|
|
|
+
|
|
|
|
|
+JSON
|
|
|
|
|
+
|
|
|
|
|
+{
|
|
|
|
|
+ "url": "https://www.pinterest.com/pin/1234567890/",
|
|
|
|
|
+ "post_id": "1234567890",
|
|
|
|
|
+ "comments_num": 85,
|
|
|
|
|
+ "comments": [
|
|
|
|
|
+ {
|
|
|
|
|
+ "comment_id": "c1",
|
|
|
|
|
+ "author_name": "User A",
|
|
|
|
|
+ "author_url": "https://www.pinterest.com/user_a/",
|
|
|
|
|
+ "text": "Love this!",
|
|
|
|
|
+ "date": "2026-02-11T08:00:00Z",
|
|
|
|
|
+ "likes": 10
|
|
|
|
|
+ // 是否有 replies / replies_count,要看当前 schema
|
|
|
|
|
+ }
|
|
|
|
|
+ ]
|
|
|
|
|
+}
|
|
|
|
|
+“查看更多回复”这一层,目前 Web Scraper API 不能让你自己写脚本去点按钮,只能拿到爬虫内部能抓到的评论层级。
|
|
|
|
|
+
|
|
|
|
|
+4)如果你坚持“只发一次 GET,不想轮询”
|
|
|
|
|
+技术上可以这样做,但有风险:
|
|
|
|
|
+
|
|
|
|
|
+先 `POST /trigger` 得到 `snapshot_id`;
|
|
|
|
|
+立刻 `GET /snapshot/{id}?format=json` 一次;
|
|
|
|
|
+不管 status 是不是 ready,都直接用返回内容。
|
|
|
|
|
+问题是:
|
|
|
|
|
+
|
|
|
|
|
+如果任务还没完成,你拿到的可能只是状态信息,而不是完整数据;
|
|
|
|
|
+所以从业务可靠性角度,不建议这么做。
|
|
|
|
|
+总结
|
|
|
|
|
+是的:整个 Web Scraper API 完全是标准 HTTP GET/POST,你可以只用 Node.js 原生 `https` 或 curl 来调用;
|
|
|
|
|
+但:必须先 POST 再 GET,因为采集是异步任务,不可能只靠一个 GET URL 完成所有逻辑;
|
|
|
|
|
+你要的 3 步 Pinterest 工作流,都可以用上面这种“纯 HTTP”方式实现,只是“查看更多回复”这种交互目前不能自定义点击。
|