我正在尝试从谷歌地图中抓取数据。我编写了一个Puppeteer代码,并使用节点JS运行它。我收到了这个错误。
Recieved an error, attempting to move on...
(node:6708) UnhandledPromiseRejectionWarning: TypeError: (intermediate value) is not iterable (cannot read property undefined)
at main (C:\Users\emrah\OneDrive\Desktop\pups\google.js:133:18)
(Use `node --trace-warnings ...` to show where the warning was created)
(node:6708) UnhandledPromiseRejectionWarning: Unhandled promise rejection. This error originated either by throwing inside of an async function without a catch block, or by rejecting a promise which was not
handled with .catch(). To terminate the node process on unhandled promise rejection, use the CLI flag `--unhandled-rejections=strict` (see https://nodejs.org/api/cli.html#cli_unhandled_rejections_mode). (rejection id: 1)
(node:6708) [DEP0018] DeprecationWarning: Unhandled promise rejections are deprecated. In the future, promise rejections that are not handled will terminate the Node.js process with a non-zero exit code.
我的密码在下面。你能帮我解决我缺少的东西吗?谢谢
const puppeteer = require('puppeteer'); /// import puppeteer from "puppeteer";
const defaultDelay = 300; // Increase this if running on a laggy browser or device
let debugBool = true;
let debug = {
log: (...strings) => debugBool && console.log(strings.join(' ')),
};
const xlsx = require('xlsx');
// Get the data
async function getPageData(url, page) {
await page.goto(url);
await page.waitForSelector('[role="main"]').catch(movingOn);
//Shop Name
let shopName =
(await page.$eval('[role="main"]', element =>
element.getAttribute('aria-label')
)) || 'No shop name provided';
//Shop Address
let address =
(await page.$eval(
'button[data-item-id="address"]',
element => element.innerText
)) || 'Delivery service (No address)';
//Website
let website =
(await page.$eval(
'[data-tooltip="Open website"]',
element => element.innerText
)) || 'No website provided';
let returnObj = {
shop: shopName?.trim?.(),
address: address?.trim?.(),
website: website?.trim?.(),
};
console.log(returnObj);
return returnObj;
//await browser.close();
}
//Get Links
async function getLinks(page) {
// Scrolling to bottom of page
let newScrollHeight = 0;
let scrollHeight = 1000;
let divSelector = '#pane > div > div > div > div > div:nth-child(4) > div';
debug.log('Waiting for the page to load in');
await page.waitForTimeout(defaultDelay * 11);
debug.log('Starting to scroll now');
while (true) {
await page.waitForSelector(divSelector).catch();
await page.evaluate(
(scrollHeight, divSelector) =>
document.querySelector(divSelector).scrollTo(0, scrollHeight),
scrollHeight,
divSelector
);
await page.waitForTimeout(defaultDelay);
newScrollHeight = await page.$eval(
divSelector,
div => div.scrollHeight
);
debug.log('scrolled by', newScrollHeight);
if (scrollHeight === newScrollHeight) {
break;
} else {
scrollHeight = newScrollHeight;
}
}
debug.log('finished scrolling');
// Get results
const searchResults = await page.evaluate(() =>
Array.from(document.querySelectorAll('a'))
.map(el => el.href)
.filter(
link =>
link.match(/https:\/\/www.google.com\/maps\//g, link) &&
!link.match(/\=https:\/\/www.google.com\/maps\//g, link)
)
);
console.log(searchResults);
debug.log('I got', searchResults.length, 'results');
return searchResults;
}
async function isNextButtonDisabled(page) {
let state = await page.$eval('button[aria-label=" Next page "]', button =>
button.getAttribute('disabled') ? true : false
);
debug.log(
'We are',
state ? ' at the end of the pages' : 'not at the end of the pages'
);
return state;
}
function movingOn() {
debug.log('Wait timed out, moving on...');
}
function genericMovingOn() {
debug.log('Recieved an error, attempting to move on...');
}
async function main(searchQuery = 'flower shop des moines Iowa') {
const browser = await puppeteer.launch({ headless: false });
const [page] = await browser.pages();
await page.goto('https://www.google.com/maps/?q=' + searchQuery);
await page
.waitForNavigation({ waitUntil: 'domcontentloaded' })
.catch(movingOn);
await page.waitForTimeout(defaultDelay * 10);
let allLinks = [];
while (!(await isNextButtonDisabled(page).catch(genericMovingOn))) {
// If it hasn't go to the next page
allLinks.push(...(await getLinks(page).catch(genericMovingOn)));
await page
.$eval('button[aria-label=" Next page "]', element =>
element.click()
)
.catch(genericMovingOn);
debug.log('moving to the next page');
if (await isNextButtonDisabled(page).catch(genericMovingOn)) break;
await page
.waitForNavigation({ waitUntil: 'domcontentloaded' })
.catch(movingOn);
}
allLinks = Array.from(new Set(allLinks));
console.log(allLinks);
let scrapedData = [];
for (let i = 0; i < allLinks.length; i++) {
let link = allLinks[i];
let data = await getPageData(link, page).catch(genericMovingOn);
scrapedData.push(data);
}
scrapedData = scrapedData.filter(Boolean)
const wb = xlsx.utils.book_new();
const ws = xlsx.utils.json_to_sheet(scrapedData);
xlsx.utils.book_append_sheet(wb,ws), {origin: -1};
xlsx.writeFile(wb,"flowershop.xlsx");
console.log(scrapedData);
debug.log("Scrape complete!")
}
console.clear();
main();
发布于 2022-05-11 06:29:58
代码中的问题是一些错误的选择器。我修复了它们,还重写了异步/等待方法与然后的链的组合,因为您需要使用其中之一。还有一些事情..。检查在线IDE中的代码
const puppeteer = require("puppeteer"); /// import puppeteer from "puppeteer";
const defaultDelay = 1000; // Increase this if running on a laggy browser or device
const debugBool = true;
const debug = {
log: (...strings) => debugBool && console.log(strings.join(" ")),
};
// const xlsx = require('xlsx');
// Get the data
async function getPageData(url, page) {
await page.goto(url);
try {
await page.waitForSelector('[role="main"]');
} catch (e) {
movingOn();
}
//Shop Name
const shopName = (await page.$eval('[role="main"]', (element) => element.getAttribute("aria-label"))) || "No shop name provided";
//Shop Address
const address = (await page.$eval('button[data-item-id="address"]', (element) => element.innerText)) || "Delivery service (No address)";
//Website
const website = (await page.$eval('[data-tooltip="Open website"]', (element) => element.innerText)) || "No website provided";
const returnObj = {
shop: shopName?.trim(),
address: address?.trim(),
website: website?.trim(),
};
console.log(returnObj);
return returnObj;
//await browser.close();
}
//Get Links
async function getLinks(page) {
// Scrolling to bottom of page
let newScrollHeight = 0;
let scrollHeight = 1000;
let divSelector = "[role='main'] > div:nth-child(2) > div";
debug.log("Waiting for the page to load in");
await page.waitForTimeout(defaultDelay * 11);
debug.log("Starting to scroll now");
while (true) {
try {
await page.waitForSelector(divSelector);
} catch (e) {
movingOn();
}
await page.evaluate((scrollHeight, divSelector) => document.querySelector(divSelector).scrollTo(0, scrollHeight), scrollHeight, divSelector);
await page.waitForTimeout(defaultDelay);
newScrollHeight = await page.$eval(divSelector, (div) => div.scrollHeight);
debug.log("scrolled by", newScrollHeight);
if (scrollHeight === newScrollHeight) {
break;
} else {
scrollHeight = newScrollHeight;
}
}
debug.log("finished scrolling");
// Get results
const searchResults = await page.evaluate(() =>
Array.from(document.querySelectorAll("a"))
.map((el) => el.href)
.filter((link) => link.match(/https:\/\/www.google.com\/maps\//g, link) && !link.match(/\=https:\/\/www.google.com\/maps\//g, link))
);
console.log(searchResults);
debug.log("I got", searchResults.length, "results");
return searchResults;
}
async function isNextButtonDisabled(page) {
const state = await page.$eval('button[aria-label=" Next page "]', (button) => (button.getAttribute("disabled") ? true : false));
debug.log("We are", state ? " at the end of the pages" : "not at the end of the pages");
return state;
}
function movingOn() {
debug.log("Wait timed out, moving on...");
}
function genericMovingOn() {
debug.log("Recieved an error, attempting to move on...");
}
async function main(searchQuery = "flower shop des moines Iowa") {
const browser = await puppeteer.launch({
headless: false,
args: ["--no-sandbox", "--disable-setuid-sandbox"],
});
const [page] = await browser.pages();
await page.goto("https://www.google.com/maps/?q=" + searchQuery);
try {
await page.waitForNavigation({ waitUntil: "domcontentloaded" });
} catch (e) {
movingOn();
}
await page.waitForTimeout(defaultDelay * 10);
let allLinks = [];
let isDisabled;
try {
isDisabled = await isNextButtonDisabled(page);
} catch (e) {
genericMovingOn();
}
while (!isDisabled) {
// If it hasn't go to the next page
try {
const links = await getLinks(page);
allLinks.push(...links);
await page.$eval('button[aria-label=" Next page "]', (element) => element.click());
debug.log("moving to the next page");
} catch (e) {
genericMovingOn();
}
try {
isDisabled = await isNextButtonDisabled(page);
} catch (e) {
genericMovingOn();
}
if (isDisabled) break;
try {
await page.waitForNavigation({ waitUntil: "domcontentloaded" });
} catch (e) {
movingOn();
}
}
allLinks = Array.from(new Set(allLinks));
console.log(allLinks);
let scrapedData = [];
for (let i = 0; i < allLinks.length; i++) {
const link = allLinks[i];
try {
const data = await getPageData(link, page);
scrapedData.push(data);
} catch (e) {
genericMovingOn();
}
}
// scrapedData = scrapedData.filter(Boolean)
// const wb = xlsx.utils.book_new();
// const ws = xlsx.utils.json_to_sheet(scrapedData);
// xlsx.utils.book_append_sheet(wb,ws), {origin: -1};
// xlsx.writeFile(wb,"flowershop.xlsx");
console.log(scrapedData);
debug.log("Scrape complete!");
}
console.clear();
main();
输出:
We are not at the end of the pages
Waiting for the page to load in
Starting to scroll now
scrolled by 1733
scrolled by 2478
scrolled by 3201
scrolled by 3201
finished scrolling
[
'https://www.google.com/maps/place/Flowerama+Des+Moines/data=!4m6!3m5!1s0x87ee980de8926543:0xf2b5d3bed00298a!8m2!3d41.5540183!4d-93.5972285!16s%2Fg%2F1tpll8wj?authuser=0&hl=en&rclk=1',
"https://www.google.com/maps/place/Irene's+Flowers/data=!4m6!3m5!1s0x87ee9939947d3f37:0x240aba7767b59599!8m2!3d41.5995484!4d-93.6507188!16s%2Fg%2F1tvm39xl?authuser=0&hl=en&rclk=1",
'https://www.google.com/maps/place/Boesen+The+Florist/data=!4m6!3m5!1s0x87ee9c1c1c4cb587:0x1e0c4959fbf34f6a!8m2!3d41.6267652!4d-93.6767491!16s%2Fg%2F1td2vg3b?authuser=0&hl=en&rclk=1',
'https://www.google.com/maps/place/Boesen+the+Florist/data=!4m6!3m5!1s0x87ee9ed6bee42e7d:0xcb33d46e89c3605a!8m2!3d41.5866929!4d-93.668475!16s%2Fg%2F1v0llbbj?authuser=0&hl=en&rclk=1',
'https://www.google.com/maps/place/Nielsen+Flower+Shop+Inc./data=!4m6!3m5!1s0x87ee9e23839eda5d:0x25a5ca69824457d2!8m2!3d41.5960756!4d-93.7379209!16s%2Fg%2F1tf36j12?authuser=0&hl=en&rclk=1',
'https://www.google.com/maps/place/Hy-Vee+Floral/data=!4m6!3m5!1s0x87ee9bfd29b3601d:0xf8a93939390d2233!8m2!3d41.625509!4d-93.652655!16s%2Fg%2F1tmqgrwk?authuser=0&hl=en&rclk=1',
'https://www.google.com/maps/place/Wildflower/data=!4m6!3m5!1s0x87ee99ad677ff647:0x1d8781a36a2887a7!8m2!3d41.5854855!4d-93.6541196!16s%2Fg%2F11g1lnqnd2?authuser=0&hl=en&rclk=1',
'https://www.google.com/maps/place/The+Wild+Orchid/data=!4m6!3m5!1s0x87ee9945df550f53:0x370c6279e304dcba!8m2!3d41.5860602!4d-93.6505064!16s%2Fg%2F11h0ykpc9b?authuser=0&hl=en&rclk=1',
'https://www.google.com/maps/place/Hy-Vee+Floral/data=!4m6!3m5!1s0x87eea29553b5443f:0xf6786dd336c55bc8!8m2!3d41.5252573!4d-93.6026348!16s%2Fg%2F1tk701jx?authuser=0&hl=en&rclk=1',
'https://www.google.com/maps/place/Flowers+By+Anthony/data=!4m6!3m5!1s0x87ee9888be543f7f:0x2910a8c95a70426b!8m2!3d41.5544784!4d-93.6265733!16s%2Fg%2F1tg79lnh?authuser=0&hl=en&rclk=1',
'https://www.google.com/maps/place/Vintage+Barn+Floral/data=!4m6!3m5!1s0x87eea267b43c01b5:0xc33f2bd2d21b1ea8!8m2!3d41.5262999!4d-93.6304368!16s%2Fg%2F11g9jj31y4?authuser=0&hl=en&rclk=1',
'https://www.google.com/maps/place/Hy-Vee+Floral/data=!4m6!3m5!1s0x87eea20a5c2518ed:0x38621b6cb4ed0a81!8m2!3d41.5424474!4d-93.6430022!16s%2Fg%2F1yfjk2jr0?authuser=0&hl=en&rclk=1',
'https://www.google.com/maps/place/A+%26+T+Floral+Shop/data=!4m6!3m5!1s0x87eea2639a77abf5:0x456db35484bca0f5!8m2!3d41.5310947!4d-93.6260063!16s%2Fg%2F11gr3ftmp9?authuser=0&hl=en&rclk=1',
'https://www.google.com/maps/place/Something+Chic+Floral/data=!4m6!3m5!1s0x87ee9e202ee42883:0x40ad7dac8108d5fc!8m2!3d41.5726407!4d-93.7337251!16s%2Fg%2F1trxkv5w?authuser=0&hl=en&rclk=1',
'https://www.google.com/maps/place/Hy-Vee+Floral/data=!4m6!3m5!1s0x87ee90b2f1294c4d:0x64e20c5d311e4337!8m2!3d41.628712!4d-93.5698329!16s%2Fg%2F1tnjk3n4?authuser=0&hl=en&rclk=1',
'https://www.google.com/maps/place/Flowerama+West+Des+Moines/data=!4m6!3m5!1s0x87ee9e41fb6edcb9:0x43a12dec9ec234b5!8m2!3d41.6006711!4d-93.7184221!16s%2Fg%2F1tltmy6h?authuser=0&hl=en&rclk=1',
'https://www.google.com/maps/place/Adina+Blooms/data=!4m6!3m5!1s0x87ee99a99c40287f:0xa35456bc3228a415!8m2!3d41.626262!4d-93.6318535!16s%2Fg%2F11fr3vyl4d?authuser=0&hl=en&rclk=1',
'https://www.google.com/maps/place/Hy-Vee+Floral/data=!4m6!3m5!1s0x87ee9fcf637d8843:0x86b1d654dc505e76!8m2!3d41.5709196!4d-93.7294863!16s%2Fg%2F12lkj773m?authuser=0&hl=en&rclk=1',
'https://www.google.com/maps/place/Hy-Vee+Floral/data=!4m6!3m5!1s0x87ee980dfef845dd:0x37245f8a02a1948e!8m2!3d41.554758!4d-93.59495!16s%2Fg%2F1hm444xff?authuser=0&hl=en&rclk=1',
"https://www.google.com/maps/place/Sam's+Club+Floral/data=!4m6!3m5!1s0x87ee9ff3278fa543:0x3b9deb8224abde95!8m2!3d41.5983872!4d-93.7140387!16s%2Fg%2F11j0w3m1qw?authuser=0&hl=en&rclk=1"
]
I got 20 results
... other data
{
shop: 'Hy-Vee Grocery Store',
address: '2540 E Euclid Ave, Des Moines, IA 50317, United States',
website: 'https://www.hy-vee.com/stores/detail.aspx?s=48&utm_source=google&utm_medium=organic&utm_campaign=gmb-listing'
} ... other results
或者,您可以使用来自SerpApi的SerpApi来完成它。这是一个有免费计划的付费API。
不同之处在于,您不必弄清楚要使用哪些选择器,然后随着时间的推移维护解析器。您可以立即获得所有信息,只需处理JSON文件即可。看看操场。
用法:
const SerpApi = require("google-search-results-nodejs");
const mySecret = process.env["API_KEY"]; // your API KEY from serpapi.com
const search = new SerpApi.GoogleSearch(mySecret);
const params = {
engine: "google_maps", // search engine
q: "flower shop des moines Iowa", // search query
google_domain: "google.com", // google domain of the search
ll: "@41.6238809, -93.9120425,10z", // GPS Coordinates parameter
type: "search", // type of search parameter
hl: "en", // language of the search
start: 0, //result offset parameter
};
const scrapedData = [];
const getData = function (data) {
const results = data.local_results;
results?.forEach((result) => {
const { title: shop = "No shop name provided", address = "Delivery service (No address)", website = "No website provided" } = result;
const info = {
shop,
address,
website,
};
scrapedData.push(info);
});
if (data.serpapi_pagination?.next) {
params.start += 20;
searchInfo();
} else {
console.log(scrapedData);
}
};
const searchInfo = () => {
search.json(params, getData);
};
searchInfo();
输出:
[
{
shop: 'Flowerama Des Moines',
address: '3310 SE 14th St, Des Moines, IA 50320',
website: 'http://www.1800flowersdesmoines.flowerama.com/'
},
{
shop: "Irene's Flowers",
address: '1151 25th St, Des Moines, IA 50311',
website: 'http://www.dsmflorist.com/'
},
{
shop: 'Boesen The Florist',
address: '3422 Beaver Ave, Des Moines, IA 50310',
website: 'http://www.boesen.com/'
},
...
]
免责声明,我为SerpApi工作。
https://stackoverflow.com/questions/70601311
复制相似问题