I am using the code below to scrape the friend list from a Facebook UID and I am getting this error:
File "C:\Users\Tn\PycharmProjects\untitled\test\1.py", line 15, in friend_uid_list
soup = from_uid(uid)
File "C:\Users\Tn\PycharmProjects\untitled\test\1.py", line 11, in from_uid
driver.get('https://www.facebook.com/' + uid + '/friends')
NameError: name 'driver' is not defined
Can you show me how to fix it? Thank you very much! Here is my code:
import multiprocessing
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By

def from_uid(uid):
    driver.get('https://www.facebook.com/' + uid + '/friends')
    return BeautifulSoup(driver.page_source, "html5lib")

def friend_uid_list(uid):
    soup = from_uid(uid)
    friends = soup.find_all("div", class_="fsl fwb fcb")
    target = open('C:/friend_uid_list.txt', 'a')
    for href in friends:
        href = href.find('a')
        try:
            target.write(href + "\n")
        except:
            pass
    target.close()

if __name__ == '__main__':
    driver = webdriver.Firefox()
    driver.get("https://www.facebook.com/")
    driver.find_element_by_css_selector("#email").send_keys("myemail@gmail.com")
    driver.find_element_by_css_selector("#pass").send_keys("mypass")
    driver.find_element_by_css_selector("#u_0_m").click()

    pool = multiprocessing.Pool(3)
    pool.map(friend_uid_list, [100004159542140, 100004159542140, 100004159542140])
The reason is simple: you create new processes, and they cannot see the variables defined in another process (the main process).
There are several solutions:
1. Pass the variables you need as arguments. That is not possible here, though, because driver is not picklable.
2. Create a new driver in each process (a minimal sketch follows this list).
3. Use multithreading instead of multiprocessing. However, I'm not sure whether Selenium works well that way; you'll have to test it yourself.
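For the second option, here is a minimal sketch of how the script could be restructured so that each worker process creates, logs in with, and quits its own browser. The credentials, selectors, and output path are placeholders carried over from the question, and make_logged_in_driver is an illustrative helper name, not something from the original post:

import multiprocessing

from bs4 import BeautifulSoup
from selenium import webdriver

def make_logged_in_driver():
    # Each process needs its own browser and its own login session
    driver = webdriver.Firefox()
    driver.get("https://www.facebook.com/")
    driver.find_element_by_css_selector("#email").send_keys("myemail@gmail.com")
    driver.find_element_by_css_selector("#pass").send_keys("mypass")
    driver.find_element_by_css_selector("#u_0_m").click()
    return driver

def friend_uid_list(uid):
    driver = make_logged_in_driver()
    try:
        driver.get('https://www.facebook.com/' + str(uid) + '/friends')
        soup = BeautifulSoup(driver.page_source, "html5lib")
        friends = soup.find_all("div", class_="fsl fwb fcb")
        with open('C:/friend_uid_list.txt', 'a') as target:
            for div in friends:
                a = div.find('a')
                if a is not None and a.get('href'):
                    target.write(a['href'] + "\n")
    finally:
        driver.quit()  # always release the browser, even on errors

if __name__ == '__main__':
    uids = [100004159542140, 100004159542140, 100004159542140]
    pool = multiprocessing.Pool(3)
    pool.map(friend_uid_list, uids)
    pool.close()
    pool.join()

Note that several processes appending to the same file can interleave their writes; returning the collected hrefs from pool.map and writing the file once in the main process would be safer.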
The Situation:
I recently started web scraping with Selenium and Scrapy, and I am working on a project where I have a CSV file containing 42,000 zip codes. My job is to take each zip code, enter it on this site, and scrape all the results.
The Problem:
The problem is that I have to keep clicking the 'load more' button until all the results have been displayed, and only once that has finished can I collect the data.
This might not be much of an issue on its own, but it takes about 2 minutes per zip code and I have 42,000 zip codes to get through.
The Code:
import scrapy
from numpy.lib.npyio import load
from selenium import webdriver
from selenium.common.exceptions import ElementClickInterceptedException, ElementNotInteractableException, ElementNotSelectableException, NoSuchElementException, StaleElementReferenceException
from selenium.webdriver.common.keys import Keys
from items import CareCreditItem
from datetime import datetime
import os
from scrapy.crawler import CrawlerProcess

global pin_code
pin_code = input("enter pin code")

class CareCredit1Spider(scrapy.Spider):
    name = 'care_credit_1'
    start_urls = ['https://www.carecredit.com/doctor-locator/results/Any-Profession/Any-Specialty//?Sort=D&Radius=75&Page=1']

    def start_requests(self):
        directory = os.getcwd()
        options = webdriver.ChromeOptions()
        options.headless = True
        options.add_experimental_option("excludeSwitches", ["enable-logging"])
        path = (directory+r"\\Chromedriver.exe")
        driver = webdriver.Chrome(path,options=options)

        #URL of the website
        url = "https://www.carecredit.com/doctor-locator/results/Any-Profession/Any-Specialty/" +pin_code + "/?Sort=D&Radius=75&Page=1"
        driver.maximize_window()
        #opening link in the browser
        driver.get(url)
        driver.implicitly_wait(200)

        try:
            cookies = driver.find_element_by_xpath('//*[@id="onetrust-accept-btn-handler"]')
            cookies.click()
        except:
            pass

        i = 0
        loadMoreButtonExists = True
        while loadMoreButtonExists:
            try:
                load_more = driver.find_element_by_xpath('//*[@id="next-page"]')
                load_more.click()
                driver.implicitly_wait(30)
            except ElementNotInteractableException:
                loadMoreButtonExists = False
            except ElementClickInterceptedException:
                pass
            except StaleElementReferenceException:
                pass
            except NoSuchElementException:
                loadMoreButtonExists = False

        try:
            previous_page = driver.find_element_by_xpath('//*[@id="previous-page"]')
            previous_page.click()
        except:
            pass

        name = driver.find_elements_by_class_name('dl-result-item')
        r = 1
        temp_list=[]
        j = 0
        for element in name:
            link = element.find_element_by_tag_name('a')
            c = link.get_property('href')
            yield scrapy.Request(c)

    def parse(self, response):
        item = CareCreditItem()
        item['Practise_name'] = response.css('h1 ::text').get()
        item['address'] = response.css('.google-maps-external ::text').get()
        item['phone_no'] = response.css('.dl-detail-phone ::text').get()
        yield item

now = datetime.now()
dt_string = now.strftime("%d/%m/%Y")
dt = now.strftime("%H-%M-%S")
file_name = dt_string+"_"+dt+"zip-code"+pin_code+".csv"

process = CrawlerProcess(settings={
    'FEED_URI' : file_name,
    'FEED_FORMAT':'csv'
})

process.crawl(CareCredit1Spider)
process.start()
print("CSV File is Ready")
items.py
import scrapy

class CareCreditItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    Practise_name = scrapy.Field()
    address = scrapy.Field()
    phone_no = scrapy.Field()
The Question:
Essentially my question is simple: is there a way to optimize this code so it performs faster? Or are there other methods for scraping this data that don't take forever?
Since the site loads the data dynamically from an API, you can retrieve the data directly from the API. This will speed things up quite a bit, but I'd still implement a wait to avoid hitting the rate limit.
import requests
import time
import pandas as pd

zipcode = '00704'
radius = 75

url = f'https://www.carecredit.com/sites/ContentServer?d=&pagename=CCGetLocatorService&Zip={zipcode}&City=&State=&Lat=&Long=&Sort=D&Radius={radius}&PracticePhone=&Profession=&location={zipcode}&Page=1'

# the first page returns the results plus the total page count
req = requests.get(url)
r = req.json()
data = r['results']

# walk the remaining pages and collect their results
for i in range(2, r['maxPage'] + 1):
    url = f'https://www.carecredit.com/sites/ContentServer?d=&pagename=CCGetLocatorService&Zip={zipcode}&City=&State=&Lat=&Long=&Sort=D&Radius={radius}&PracticePhone=&Profession=&location={zipcode}&Page={i}'
    req = requests.get(url)
    r = req.json()
    data.extend(r['results'])
    time.sleep(1)

df = pd.DataFrame(data)
# use '-' rather than '/' in the timestamp so it is a valid file name
df.to_csv(f'{pd.Timestamp.now().strftime("%d-%m-%Y_%H-%M-%S")}zip-code{zipcode}.csv')
There are multiple ways in which you can do this.
1. Create a distributed system in which you run the spider across multiple machines so the work happens in parallel.
This, in my opinion, is the better of the options, because you can also build a scalable, dynamic solution that you will be able to reuse many times over.
There are many ways of doing this. Normally it consists of dividing the seed list (the zip codes) into many separate seed lists so that separate processes each work on their own seed list; the downloads then run in parallel, so on 2 machines it goes 2 times faster, on 10 machines 10 times faster, and so on.
In order to do this I would suggest looking into AWS, namely AWS Lambda, AWS EC2 instances, or even AWS Spot Instances. These are the ones I have worked with previously and they are not terribly hard to work with.
2. Alternatively, if you want to run it on a single machine, you can look into multithreading with Python, which can help you run the process in parallel on that one machine (see the sketch after this list).
3. Another option, particularly if it is a one-off process, is to run it simply with requests, which may speed things up; but with a massive number of seeds it is usually faster to develop a process that runs in parallel.
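As an illustration of option 2, here is a minimal sketch that combines single-machine multithreading with the API approach from the other answer. fetch_zipcode, the worker count, and the sample zip codes are illustrative choices of mine, not something from the original posts:

import concurrent.futures
import time

import pandas as pd
import requests

def fetch_zipcode(zipcode, radius=75):
    # Pull every page for one zip code from the locator API and return the rows
    base = ('https://www.carecredit.com/sites/ContentServer?d=&pagename=CCGetLocatorService'
            f'&Zip={zipcode}&City=&State=&Lat=&Long=&Sort=D&Radius={radius}'
            f'&PracticePhone=&Profession=&location={zipcode}&Page=')
    r = requests.get(base + '1').json()
    rows = r['results']
    for page in range(2, r['maxPage'] + 1):
        rows.extend(requests.get(base + str(page)).json()['results'])
        time.sleep(1)  # stay polite to the API
    return rows

zip_codes = ['00704', '10001', '94103']  # in practice, read the 42,000 codes from the CSV

all_rows = []
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as pool:
    for rows in pool.map(fetch_zipcode, zip_codes):
        all_rows.extend(rows)

pd.DataFrame(all_rows).to_csv('carecredit_results.csv', index=False)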
Hi guys, I am trying to get this code to download images from Google. I am helpless at this point: I have tried everything in my power to figure out what is going on, and I still don't know what's wrong. Please have a look at the code below and the error message I am getting.
The code essentially runs: it opens the browser and scrolls through the page, but then the images are not downloaded and I get an error message...
import requests
import time
import urllib
import argparse
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from fake_useragent import UserAgent
from multiprocessing import Pool
from lxml.html import fromstring
import os, sys
import wget

no=1

def search(url):
    # Create a browser
    browser = webdriver.Chrome('chromedriver')
    browser.implicitly_wait(30)

    # Open the link
    browser.get(url)
    time.sleep(0.5)

    element = browser.find_element_by_tag_name("body")

    # Scroll down
    for i in range(40):
        element.send_keys(Keys.PAGE_DOWN)
        time.sleep(0.1)

    browser.find_element_by_id("smb").click()

    for i in range(10):
        element.send_keys(Keys.PAGE_DOWN)
        time.sleep(0.2)

    time.sleep(1)

    # Get page source and close the browser
    source = browser.page_source
    browser.close()
    return source

def download_image(link):
    global no
    #print link
    # Use a random user agent header
    headers = {"User-Agent": ua.random}
    # Get the image link
    try:
        r = requests.get("https://www.google.com" + link.get("href"), headers=headers)
    except:
        print("Cannot get link.")
    title = fromstring(r.content).findtext(".//title")
    link_url = title.split(" ")[-1]
    print(link_url)
    if link_url.find(".jpg")==len(link_url)-4:
        # Download the image
        wget.download(link_url, str(os.getcwd()) + "/" + query+"/"+str(no)+".jpg")
        no=no+1

# set stack limit
sys.setrecursionlimit(1000)

# get user input and search on google
query = input("Enter the name you want to search")
url = "https://www.google.com/search?as_st=y&tbs=isz%3Alt%2Cislt%3Asvga%2Citp%3Aphoto%2Cift%3Ajpg&tbm=isch&sa=1&ei=H_-KW6GSHImGoAS3z4DYCA&q=" +query+"&oq="+query+"&gs_l=img.3..0l10.19389.19389.0.21095.1.1.0.0.0.0.113.113.0j1.1.0....0...1c.1.64.img..0.1.111....0.QpKT5Qs8Kdo"
print(url)
source = search(url)
count=1

# Parse the page source and download pics
page_text = source.encode('utf-8').decode('ascii', 'ignore')
soup = BeautifulSoup(page_text, "html.parser")
ua = UserAgent()

# check directory and create if necessary
if not os.path.isdir(query):
    os.makedirs(query)
os.chdir(str(os.getcwd()) + "/" + query)

# get the links
links = soup.find_all("a", class_="rg_l")
for a in links[0:count]:
    try:
        download_image(a)
    except:
        pass
And I get this error... I have tried adding browser.implicitly_wait(30) to the code, but that does not work either...
selenium.common.exceptions.NoSuchElementException: Message: no such element: Unable to locate element: {"method":"css selector","selector":"[id="smb"]"}
(Session info: chrome=83.0.4103.116)
Could you please tell me how to resolve this :( Thank you in advance!!
I have created an automation script to place an order. My main objective is to get the order ID from the URL. I am storing the whole URL in a string, and this URL contains my order ID. Now, how can I extract only the order ID from the URL?
This is the URL I am getting as a string; the specific part I want (the order ID) is the value right after orderconfirmedoid=:
https://www.fitotouch.com/checkout/orderconfirmedoid=5e4d212fadda911b34f8862c&authCode=NWU0ZDIxMmZhZGRhOTExYjM0Zjg4NjJjOjIwMjAtMDItMTlUMTE6NTE6MDkuNDE2WnqeukQEbFNV8aOXKuJtXpOsH_DpTTGB7zCdbXlrhZOR
Below is my code
import time
import self as self
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
# declare variable to store the URL to be visited
base_url="https://www.fitotouch.com"
driver = webdriver.Chrome('E:/Chrome driver/chromedriver.exe')
driver.maximize_window()
#function of our 'driver' object.
driver.implicitly_wait(10) #10 is in seconds
driver.get(base_url)
driver.implicitly_wait(10)
driver.find_element_by_name('password').send_keys("*****")
driver.implicitly_wait(10)
driver.find_element_by_class_name('arrow-icon').click()
driver.implicitly_wait(10)
#FITO NAMES
driver.find_element_by_link_text("FiveSeasons").click()
driver.find_element_by_xpath('/html/body/div[1]/main/article/section/div[2]/div/section/div/div[1]/a').click()
driver.find_element_by_xpath('/html/body/div[1]/main/article/section/div[2]/div/section/article/section[1]/section/div/div[4]/div').click()
time.sleep(2.4)
#driver.find_element_by_xpath('/html/body/div[1]/header/div[3]/div/div[2]/div[3]/div[2]/a/span/svg').click()
driver.get("https://www.fitotouch.com/cart")
time.sleep(2.4)
#driver.execute_script("window.scrollTo(300, 0)")
driver.get("https://www.fitotouch.com/checkout")
driver.find_element_by_xpath('//*[@id="checkout"]/div/div[3]/div/div/div[1]/div[1]/div[2]/form/div[1]/div[1]/input').send_keys("test@gmail.com")
driver.find_element_by_xpath('//*[@id="checkout"]/div/div[3]/div/div/div[1]/div[1]/div[2]/form/div[3]/button').click()
driver.find_element_by_xpath('//*[@id="checkout"]/div/div[3]/div/div/div[1]/div[2]/div[2]/form/div[1]/div[1]/div[1]/input').send_keys("test")
driver.find_element_by_xpath('//*[@id="checkout"]/div/div[3]/div/div/div[1]/div[2]/div[2]/form/div[1]/div[1]/div[2]/input').send_keys("Malik")
driver.find_element_by_xpath('//*[@id="checkout"]/div/div[3]/div/div/div[1]/div[2]/div[2]/form/div[1]/div[2]/div[1]/div[1]/input').send_keys("port qasim")
driver.find_element_by_xpath('//*[@id="checkout"]/div/div[3]/div/div/div[1]/div[2]/div[2]/form/div[1]/div[2]/div[4]/input').send_keys("5426")
driver.find_element_by_xpath('//*[@id="checkout"]/div/div[3]/div/div/div[1]/div[2]/div[2]/form/div[1]/div[2]/div[5]/input').send_keys("karachi")
driver.find_element_by_xpath('//*[@id="checkout"]/div/div[3]/div/div/div[1]/div[2]/div[2]/form/div[1]/div[2]/div[6]/input').send_keys("sindh")
driver.find_element_by_xpath('//*[@id="checkout"]/div/div[3]/div/div/div[1]/div[2]/div[2]/form/div[1]/div[3]/input').send_keys("031545454")
time.sleep(2.4)
driver.find_element_by_xpath('//*[@id="checkout"]/div/div[3]/div/div/div[1]/div[2]/div[2]/form/div[4]').click()
time.sleep(2.4)
driver.find_element_by_xpath('//*[@id="checkout"]/div/div[3]/div/div/div[1]/div[3]/div[2]/div/form/div[7]').click()
time.sleep(2.4)
driver.find_element_by_xpath('//*[@id="checkout"]/div/div[3]/div/div/div[1]/div[4]/div[2]/div[4]').click()
time.sleep(2.4)
url = driver.current_url
print(url)
url = url.split('=')[1].split('&')[0]
BTW this will work too:
url = url.split('&')[0].split('=')[1]
You can use the Python built-in module re to extract the orderconfirmedoid using multiple delimiters as follows:
Code:
import re

#url = driver.current_url
url = "https://www.fitotouch.com/checkout/orderconfirmedoid=5e4d212fadda911b34f8862c&authCode=NWU0ZDIxMmZhZGRhOTExYjM0Zjg4NjJjOjIwMjAtMDItMTlUMTE6NTE6MDkuNDE2WnqeukQEbFNV8aOXKuJtXpOsH_DpTTGB7zCdbXlrhZOR"
my_order_id = re.split('=|&', url)[1]
print(my_order_id)
Console Output:
5e4d212fadda911b34f8862c
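As an aside, if you would rather not count delimiters, Python's built-in urllib.parse can pull the value out by parameter name. Since the captured URL has no '?', the sketch below treats everything after the last '/' as the query string; that assumption is mine, based only on the URL shown above:

from urllib.parse import parse_qs

url = "https://www.fitotouch.com/checkout/orderconfirmedoid=5e4d212fadda911b34f8862c&authCode=NWU0ZDIxMmZhZGRhOTExYjM0Zjg4NjJjOjIwMjAtMDItMTlUMTE6NTE6MDkuNDE2WnqeukQEbFNV8aOXKuJtXpOsH_DpTTGB7zCdbXlrhZOR"

# no '?' in the captured URL, so take the part after the last '/' as the query string
query = url.rsplit('/', 1)[-1]
params = parse_qs(query)
print(params['orderconfirmedoid'][0])  # 5e4d212fadda911b34f8862c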
New to multiprocessing! Please help.
All libraries are imported and the get_links method works; I've tested it on a single case. I'm trying to make the method run for multiple URLs, distributing them across parallel processes to make it faster. Without multiprocessing my runtimes are 10+ hours.
Edit 2:
I've tried my best to put together an MCVE:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
from multiprocessing import Pool

options = Options()
options.headless = True
options.binary_location = 'C:\\Users\\Liam\\AppData\\Local\\Google\\Chrome SxS\\Application\\Chrome.exe'
options.add_argument('--blink-settings=imagesEnabled=false')
options.add_argument('--no-sandbox')
options.add_argument("--proxy-server='direct://'")
options.add_argument("--proxy-bypass-list=*")

subsubarea_urls = []
with open('subsubarea_urls.txt') as f:
    for item in f:
        item = item.strip()
        subsubarea_urls.append(item)

test_urls = subsubarea_urls[:3]

def get_links(url):
    driver = webdriver.Chrome('....\Chromedriver', chrome_options=options)
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    link = soup.find(class_ = 'listings__all')
    if link is not None:
        link = "example.com" + link.find('a')['href']
    driver.close()
    return link

def main():
    how_many = 3
    p = Pool(processes = how_many)
    data = p.map(get_links, test_urls)
    p.close()
    with open('test_urls.txt', 'w') as f:
        f.write(str(data))

if __name__ == '__main__':
    main()
Unexpectedly, the problem had nothing to do with the code. Multiprocessing in Python does not seem to get along with Windows GUI shells: the subprocesses called by Pool don't have std streams.
The code needs to be executed in IDLE (run python -m idlelib.idle to open IDLE).
See Terry Jan Reedy's answer here.
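If you also want visibility into what each worker is doing when the child processes have no usable std streams, one workaround (my suggestion, separate from the IDLE explanation above) is to have every worker log to a file instead of printing. get_links and test_urls below are the names from the MCVE above; init_worker and get_links_logged are illustrative helpers:

import logging
import multiprocessing

def init_worker():
    # runs once in each worker process; append all records to a shared log file
    logging.basicConfig(filename='scrape_workers.log',
                        level=logging.INFO,
                        format='%(asctime)s %(processName)s %(message)s')

def get_links_logged(url):
    logging.info('fetching %s', url)
    try:
        return get_links(url)  # the function from the MCVE
    except Exception:
        logging.exception('failed on %s', url)
        return None

if __name__ == '__main__':
    with multiprocessing.Pool(processes=3, initializer=init_worker) as p:
        data = p.map(get_links_logged, test_urls)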
I am building a Python script and want to split up certain functions into separate files to make maintenance easier.
I currently have two files, main.py and function1.py.
main.py
#Setup Imports
import os
import os.path
import sys

# Import Functions
from function1 import myfunction

#Setup Selenium
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.keys import Keys
from selenium import webdriver

#Launch Firefox
def init_driver():
    driver = webdriver.Firefox()
    return driver

url_list = ['http://www.example.com/page1', 'http://www.example.com/contact', 'http://www.example.com/about', 'http://www.example.com/test'];

driver = init_driver()

# Init Blank List
checked_urls = []

for url in url_list:
    myfunction(driver)

print(checked_urls)
function1.py
def myfunction(driver):
    driver.get(url)
    htmlText = driver.find_element_by_css_selector("#phrase").text

    if "This Is My Phrase" in htmlText:
        checked_urls.extend(['PHRASE_FOUND'])
    else:
        checked_urls.extend(['PHRASE_FOUND'])
I am trying to get it to visit each URL in the list and check for 'This Is My Phrase' on the page. If it finds the phrase, it should add an entry to the list.
I am seeing the following error when running the script...
NameError: name 'url' is not defined
I am pretty sure it's related to the way I am importing the separate function, but I can't work out what's wrong. Can anyone help?
You also have to pass the url variable to myfunction:
def myfunction(driver, url):
    driver.get(url)
    htmlText = driver.find_element_by_css_selector("#phrase").text

    if "This Is My Phrase" in htmlText:
        checked_urls.extend(['PHRASE_FOUND'])
    else:
        checked_urls.extend(['PHRASE_FOUND'])
Then, in the main file:
for url in url_list:
    myfunction(driver, url)
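To see how the corrected pieces fit together end to end, here is a minimal sketch of that idea with one extra assumption of mine: instead of appending to checked_urls from inside function1.py (where that list is not defined), myfunction returns a result and main.py collects it. The 'PHRASE_NOT_FOUND' label in the else branch is a placeholder of mine, not something from the original post:

# function1.py
def myfunction(driver, url):
    driver.get(url)
    htmlText = driver.find_element_by_css_selector("#phrase").text
    if "This Is My Phrase" in htmlText:
        return 'PHRASE_FOUND'
    return 'PHRASE_NOT_FOUND'  # placeholder label, not in the original code

# main.py
checked_urls = []
for url in url_list:
    checked_urls.append(myfunction(driver, url))
print(checked_urls)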
I think some code should be corrected:
First, delete the blank space before url_list:
#url_list = ['http://www.example.com/page1', 'http://www.example.com/contact', 'http://www.example.com/about', 'http://www.example.com/test'];
url_list = ['http://www.example.com/page1', 'http://www.example.com/contact', 'http://www.example.com/about', 'http://www.example.com/test'];
Then, url is a local variable; it's not directly accessible inside myfunction, but it can be passed in as a function parameter:
def myfunction(driver, url):
    ...