我正在尝试使用 Apify SDK 抓取页面内容，下面的代码可以正常工作。但是，怎样才能像 puppeteer.launch({ headless: true }) 那样，强制 Apify SDK 使用无头模式？
供您参考的代码:
/**
 * Crawls pages starting at a fixed URL with Apify's PuppeteerCrawler and
 * records the URL, title and HTML length of every page that is handled.
 *
 * @param {*} number - Run identifier. It is appended to the local storage
 *   directory path and passed to `Apify.openRequestQueue`.
 * @returns {Promise<{links: Array, title: Array, content: Array}>} Parallel
 *   arrays: index i of each array describes the same page.
 */
async function scrape(number) {
  const output = { links: [], title: [], content: [] };
  const startUrl = "https://somepage/";
  process.env.APIFY_LOCAL_STORAGE_DIR = `/someappfolder/apify_storage/run_${number}`;

  const requestQueue = await Apify.openRequestQueue(number);
  await requestQueue.addRequest({ url: startUrl });
  const pseudoUrls = [new Apify.PseudoUrl(startUrl + "[.*]")];

  const crawler = new Apify.PuppeteerCrawler({
    requestQueue,
    handlePageFunction: async ({ request, page }) => {
      // Read each value once, and finish all awaits BEFORE touching `output`.
      // Several pages are handled concurrently; pushing between awaits would
      // let other handlers interleave and misalign the three arrays.
      const title = await page.title();
      const html = await page.content();
      const save = { url: request.url, title, content: html.length };

      // Synchronous pushes: no other handler can run in between.
      output.links.push(save.url);
      output.title.push(save.title);
      output.content.push(save.content);
      //sendToAirtable(save);
      console.log(`URL: ${request.url}`);

      await Apify.utils.enqueueLinks({
        page,
        selector: 'a',
        pseudoUrls,
        requestQueue,
      });
    },
    maxRequestsPerCrawl: 10,
    maxConcurrency: 10,
    minConcurrency: 2,
  });

  await crawler.run();
  return output;
}
发布于 2020-10-12 20:00:30
在与 requestQueue 相同的级别上添加 launchPuppeteerOptions: { headless: true }。
参见：https://sdk.apify.com/docs/typedefs/launch-puppeteer-options#docsNav
发布于 2020-10-12 20:03:40
// Environment variable values are strings. Assign '1' explicitly instead of
// relying on Node's implicit (deprecated) conversion of non-string values.
process.env.APIFY_HEADLESS = '1';
这是我找了几个小时后偶然发现的答案，参见：https://sdk.apify.com/docs/guides/environment-variables#apify_headless
发布于 2020-10-14 22:46:22
您可以像这样将 headless 选项添加到 launchPuppeteerOptions 中：
const crawler = new Apify.PuppeteerCrawler({
requestQueue,
launchPuppeteerOptions: {
headless: true,
ignoreHTTPSErrors: true,
// slowMo: 500,
},
maxRequestsPerCrawl: settings.maxurls,
maxConcurrency: settings.maxcrawlers,
https://stackoverflow.com/questions/64317261
复制相似问题