Guys, I may have a tricky problem here. I'm trying to build a bot that collects all the photo/video URLs of an Instagram account, adds them to a list, and finally saves them to a file. But when I checked whether it was working, I noticed that the urls list always contains 51 URLs: every time new URLs are picked up while the program is running, the list's contents are swapped for the new batch of 51 URLs, and the previous URLs are dropped from the list instead of the new ones being appended to the existing ones. Why is this happening? I need your knowledge :)
Here is the bot's code:
# Here is the run.py from where I'm running the program
import os
from time import sleep
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.options import Options
import autoit
from selenium.webdriver.common.keys import Keys
import requests
import coockies
import PopUpsClose
import login
import link
import url_extraxction

def main():
    # Makes a mobile emulator to start Instagram like on a smartphone
    mobile_emulation = {
        "deviceMetrics": {"width": 360, "height": 640, "pixelRatio": 3.0},
        "userAgent": "Mozilla/5.0 (Linux; Android 4.2.1; en-us; Nexus 5 Build/JOP40D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Mobile Safari/535.19"}
    chrome_options = Options()
    chrome_options.add_experimental_option("mobileEmulation", mobile_emulation)
    browser = webdriver.Chrome(chrome_options=chrome_options)
    browser.get('https://www.instagram.com/accounts/login/')
    coockies.close_coockies(browser)
    login.Insta_login(browser)
    PopUpsClose.pop_up(browser)
    ######################################
    # Here it takes the url from the file
    url = link.page_link(browser)
    browser.get(url)
    sleep(5)
    # Scrolling down the page and getting the URLs
    url_extraxction.extract(browser, url)

main()
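The link module is imported above but wasn't included in the post. As the comment in main() suggests, its page_link helper apparently reads the target profile URL from a file; a minimal sketch of what it might look like (the file path and name are assumptions, not from the original code):

def page_link(browser):
    # Hypothetical implementation: read the profile URL from a local text file.
    # browser is accepted to match the call in run.py, though this sketch doesn't use it.
    with open(r'C:\Users\bilakos\Desktop\PYTHON_PROJECTS\InstaAutoPhotoUpload\link.txt') as f:
        return f.readline().strip()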
Here is the login function:

from time import sleep

def Insta_login(browser):
    login_file = open(r'C:\Users\bilakos\Desktop\PYTHON_PROJECTS\InstaAutoPhotoUpload\login.txt', 'r')
    username = login_file.readline()
    while username != '':
        password = login_file.readline()
        username_ = username.rstrip("\n")
        password = password.rstrip("\n")
        username = login_file.readline()
        sleep(2)
        browser.find_element_by_xpath("""//*[@id="loginForm"]/div[1]/div[3]/div/label/input""").send_keys(username_)
        browser.find_element_by_xpath("""//*[@id="loginForm"]/div[1]/div[4]/div/label/input""").send_keys(password)
        sleep(2)
        browser.find_element_by_xpath("""/html/body/div[1]/section/main/div[1]/div/div/div/form/div[1]/div[6]/button/div""").click()
        sleep(10)
    login_file.close()
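As an aside (not part of the original code): if login.txt only ever holds a single username/password pair, the read-ahead while loop can be avoided entirely. A sketch assuming the username is on line 1 and the password on line 2, reusing the original XPaths:

def Insta_login(browser):
    # Assumes login.txt holds the username on line 1 and the password on line 2
    with open(r'C:\Users\bilakos\Desktop\PYTHON_PROJECTS\InstaAutoPhotoUpload\login.txt') as f:
        username = f.readline().strip()
        password = f.readline().strip()
    browser.find_element_by_xpath('//*[@id="loginForm"]/div[1]/div[3]/div/label/input').send_keys(username)
    browser.find_element_by_xpath('//*[@id="loginForm"]/div[1]/div[4]/div/label/input').send_keys(password)
    browser.find_element_by_xpath('/html/body/div[1]/section/main/div[1]/div/div/div/form/div[1]/div[6]/button/div').click()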
Here is the coockies function:

def close_coockies(browser):
    coockies_accept = browser.find_element_by_xpath("""/html/body/div[2]/div/div/div/div[2]/button[1]""")
    coockies_accept.click()
Here is the PopUpsClose function:

from time import sleep

def pop_up(browser):
    # Here it locates the button that closes the 1st pop-up
    not_now_button = browser.find_element_by_xpath("""/html/body/div[1]/section/main/div/div/div/button""")
    not_now_button.click()
    sleep(10)
    # Here it locates the button that closes the 2nd pop-up
    not_now_button2 = browser.find_element_by_xpath("""/html/body/div[4]/div/div/div/div[3]/button[2]""")
    not_now_button2.click()
    sleep(2)
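A side note beyond the original post: the absolute XPaths combined with fixed sleep() calls are fragile. Selenium's explicit waits poll until the element is actually clickable, which both helpers above could use; a sketch for the cookie banner, keeping the original XPath:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def close_coockies(browser):
    # Wait up to 10 seconds for the accept button instead of sleeping blindly
    accept = WebDriverWait(browser, 10).until(
        EC.element_to_be_clickable((By.XPATH, "/html/body/div[2]/div/div/div/div[2]/button[1]"))
    )
    accept.click()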
And last is the url_extraction function, where I'm having the problem:

from time import sleep
import requests
import os

def extract(browser, url):
    header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36 OPR/73.0.3856.329"}
    requests.get(url, headers=header)  # (the response of this request is never used)
    # SCROLL DOWN
    print("This process may take about 5 minutes.\n", "Don't close the program......")
    last_height = 0
    proceed = ''
    while True:
        browser.execute_script('window.scrollTo(0, document.body.scrollHeight);')
        sleep(1)
        # GET THE URLS
        elements = browser.find_elements_by_xpath('//a[@href]')
        links = []
        for elem in elements:
            urls = elem.get_attribute('href')
            if urls not in links and 'p' in urls.split('/'):
                links.append(urls)
        print(links)
        sleep(2)
        new_height = browser.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height
    if False:  # (this condition is never true, so proceed is always set to True)
        proceed = False
    else:
        proceed = True
    sleep(10)
    # Create a folder with the name of the profile
    if proceed == True:
        name = browser.find_element_by_class_name("_7UhW9.fKFbl.yUEEX.KV-D4.fDxYl")
        text = name.text
        print("Wait to create a Folder to pass the extracted links.\nPlease don't close the program.")
        print('' * 2)
        sleep(5)
        path = "C:\\Users\\bilakos\\Desktop\\PYTHON_PROJECTS\\InstaAutoPhotoUpload\\" + text
        sleep(2)
        try:
            os.mkdir(path)
            link_extraction = open('C:\\Users\\bilakos\\Desktop\\PYTHON_PROJECTS\\InstaAutoPhotoUpload\\' + text
                                   + '\\extracted_links.txt', 'w')
            sleep(2)
            print("The extracted_links.txt file is created.")
            print('' * 2)
            for i in links:
                link_extraction.write(i + '\n')
            link_extraction.close()
            sleep(2)
            print('The links transferred successfully to the file.')
        except FileExistsError:
            print('The file already exists.')
            link_extraction = open('C:\\Users\\bilakos\\Desktop\\PYTHON_PROJECTS\\InstaAutoPhotoUpload\\' + text
                                   + '\\extracted_links.txt', 'w')
            sleep(2)
            print("The extracted_links.txt file is created.")
            print('' * 2)
            for i in links:
                link_extraction.write(i + '\n')
            link_extraction.close()
            sleep(2)
            print('The links transferred successfully to the file.')

In the url_extraction function, right after the #GET THE URLS comment, is where the problem appears.
Posted on 2021-01-11 04:16:42
Inside the while loop, the list is redefined on every scroll, so in effect you only save the results of the last scroll to the file.
def extract(browser, url):
    ...
    while True:
        # scroll down
        ...
        links = []                    # <--- (1) ---
        for elem in elements:
            urls = elem.get_attribute('href')
            if urls not in links and 'p' in urls.split('/'):
                links.append(urls)    # <--- (2) ---
        print(links)
        ...
        # check if at end and if yes then break out of loop

At (1) you are defining a new list. At (2) you are appending to it. But on the next iteration of the while loop you define a new list at (1) again, and the previously collected items are lost.
To keep the results, the list has to be defined outside the while loop:
def extract(browser, url):
    ...
    links = []                        # <--- (1) ---
    while True:
        # scroll down
        ...
        for elem in elements:
            urls = elem.get_attribute('href')
            if urls not in links and 'p' in urls.split('/'):
                links.append(urls)    # <--- (2) ---
        print(links)
        ...
        # check if at end and if yes then break out of loop

https://stackoverflow.com/questions/65656059
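One more side note beyond the original answer: urls not in links scans the whole list on every check, which gets slower as the list grows. A companion set keeps membership tests O(1) while the list preserves insertion order; a minimal sketch of the same loop:

links = []        # keeps insertion order, defined once outside the loop
seen = set()      # mirror of links, for O(1) membership tests
while True:
    # scroll down
    ...
    for elem in elements:
        href = elem.get_attribute('href')
        if href not in seen and 'p' in href.split('/'):
            seen.add(href)
            links.append(href)
    ...
    # check if at end and if yes then break out of loop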