I am attempting to scrape device information from a specific website (gsmarena) based on model number. I would like to extract the model name (and eventually the price). I'm using a headless browser and rotating proxies to do so, but have had little success extracting the info required for ~2000 devices (I'm able to extract roughly 10 before all of the IPs are blocked).
The ~200 proxies are obtained from https://free-proxy-list.net/, which seems to contain at least a few that work.
I've explored a number of different options but have had little success. Below is the code I'm currently running; any help would be appreciated.
from selenium import webdriver
from selenium.webdriver.common.by import By
import time

def get_device_name(device_model, proxies_list):
    """
    This function takes in a device model and a list of proxies and returns the device name
    based on the device model by making a request to the website using a headless browser
    with the rotating proxies.

    Parameters:
    - device_model (str): The model number of the device
    - proxies_list (list): A list of proxies to be used for making requests to the website

    Returns:
    - str: The device name or None if it could not be obtained
    """
    # Check if the device model is provided, return None if not
    if device_model is None:
        return None
    # Create a list for storing working proxies
    working_proxies = list(proxies_list)
    # Start a web driver instance
    attempts = 0
    while attempts < len(working_proxies):
        proxy = working_proxies[attempts]
        print(proxy)
        try:
            # Set options for the web driver
            options = webdriver.ChromeOptions()
            options.add_argument(f'--proxy-server={proxy}')
            options.add_argument('--headless')
            options.add_argument('--disable-gpu')
            options.add_argument('--no-sandbox')
            options.add_argument('--disable-dev-shm-usage')
            driver = webdriver.Chrome(options=options)
            # Make the request to the website
            url = 'https://www.gsmarena.com/res.php3?sSearch=' + device_model
            driver.get(url)
            # Wait 5 seconds before reading the page
            time.sleep(5)
            # Find the device name on the page
            device_name = driver.find_element(By.TAG_NAME, 'strong').text.replace("\n", " ")
            driver.close()
            print(device_name)
            return device_name
        except Exception:
            # If an exception is raised, increment attempts and remove the failed proxy
            attempts += 1
            print("Attempt {} with proxy {} failed. Trying again with a different proxy...".format(attempts, proxy))
            proxies_list.remove(proxy)
            print("Proxy {} removed.".format(proxy))
            continue
    # Return None if all attempts failed
    print("All attempts failed. Unable to get device name.")
    return None
# Apply the get_device_name function to the device_model column of user_agents_2
user_agents_3 = user_agents_2
user_agents_3['device_name'] = user_agents_2['device_model'].apply(get_device_name, proxies_list=proxies_list)
print(user_agents_3)
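Not an answer to the blocking itself, but one thing that usually helps with free proxy lists is filtering out the dead ones before Chrome ever starts. Below is a rough sketch of how that pre-filtering might look, assuming requests is available; the test URL, timeout, and the filter_working_proxies name are my own choices, not part of the original code:

import requests

def filter_working_proxies(proxies_list, test_url='https://www.gsmarena.com/', timeout=5):
    """Return only the proxies that can fetch test_url within `timeout` seconds."""
    working = []
    for proxy in proxies_list:
        try:
            resp = requests.get(
                test_url,
                proxies={'http': f'http://{proxy}', 'https': f'http://{proxy}'},
                timeout=timeout,
            )
            if resp.status_code == 200:
                working.append(proxy)
        except requests.RequestException:
            # Dead, slow, or blocked proxy: skip it
            pass
    return working

The function could then be called once up front, e.g. proxies_list = filter_working_proxies(proxies_list), before handing the filtered list to get_device_name, so each call only cycles through proxies that responded at least once.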
As a fun experiment, I decided to scrape data from Google Shopping. It works perfectly on my local machine, but on my server it doesn't work. Here is the code:
# Web driver file
import re
import time
from urllib.parse import urlparse, parse_qs

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
options.headless = False
driver = webdriver.Chrome(options=options, executable_path="/Users/kevin/Documents/projects/deal_hunt/scraper_scripts/chromedriver")

def get_items(url, category):
    driver.get(url)
    results = []
    content = driver.page_source
    soup = BeautifulSoup(content, features="lxml")
    # First click on every image of a product that is on sale; that is the only way
    # to generate the class that lets us fetch the data we need
    for element in soup.find_all(attrs="i0X6df"):
        # Not all items are on sale; those that are carry a label, so we only keep those
        sale_label = element.find('span', {'class': 'Ib8pOd'})
        if sale_label is None:
            pass
        else:
            # Take the id of the image from the page and click it dynamically;
            # otherwise Selenium keeps clicking on the first picture
            parent_div = element.find('div', {'class': 'ArOc1c'})
            image_tag = parent_div.find('img')
            image_to_click = driver.find_element_by_id(image_tag['id'])
            driver.execute_script("arguments[0].click();", image_to_click)
            time.sleep(5)
    items = driver.find_elements_by_class_name('_-oQ')
    for item in items:
        image_tag = item.find_element_by_class_name('sh-div__current').get_attribute('src')
        description = item.find_element_by_class_name('sh-t__title').get_attribute('text')
        link = item.find_element_by_class_name('sh-t__title').get_attribute('href')
        store = item.find_element_by_css_selector('._-oA > span').get_attribute('textContent')
        price = item.find_elements_by_class_name('_-pX')[0].get_attribute('textContent')
        old_price = item.find_elements_by_class_name('_-pX')[1].get_attribute('textContent')
        # Only keep the digits: the page returns a series of odd characters and the
        # price sits at the end of the string
        price_array = price.split(',')
        price = ''.join(re.findall(r'\d+', price_array[0])) + '.' + price_array[1]
        old_price_array = old_price.split(',')
        old_price = ''.join(re.findall(r'\d+', old_price_array[0])) + '.' + old_price_array[1]
        # Remove the rand sign
        price = price.replace("R ", "")
        # Replace the comma with a dot
        price = price.replace(",", ".")
        # Extract the product url from inside the google redirect url
        url_to_parse = link
        parsed_url = urlparse(url_to_parse)
        product_url = parse_qs(parsed_url.query)['q'][0]
        results.append({
            'image': image_tag,
            'description': description,
            'store': store,
            'link': product_url,
            'price': float(price),
            'old_price': float(old_price)
        })
    # If we successfully scraped data, print it; otherwise report that there is nothing to add
    if len(results) > 0:
        print(results)
        print("Command has been perfectly executed")
    else:
        print("There is nothing to add")
When I run python3 main.py locally, it prints that the command has been perfectly executed, but on my Ubuntu server the same command immediately returns "There is nothing to add".
You should verify that everything necessary is installed on your server, including matching Selenium and Python versions, and also check the chromedriver path on the server, because the driver may not be running there at all.
As an additional recommendation, add checkpoints in the code to see whether it gets no data from the very start or whether it loses it somewhere further along. Superficially I do not see anything in the code that would generate the error.
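To make those checkpoints concrete, here is a minimal sanity-check sketch I would run on the Ubuntu server first, assuming the server has no display (so Chrome has to run headless); the chromedriver path is a placeholder, and the call uses the same Selenium 3 style as the question:

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
# On a display-less server Chrome usually has to run headless
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')

# Placeholder path: point this at the chromedriver installed on the server
driver = webdriver.Chrome(options=options, executable_path='/usr/local/bin/chromedriver')

driver.get('https://shopping.google.com/')
# Checkpoint: if these print an empty title or a tiny page, the page never rendered on the server
print(driver.title)
print(len(driver.page_source))
driver.quit()

If this already fails or returns an almost empty page, the problem is in the server environment rather than in the scraping logic.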
As suggested in another answer, you should debug your code and make sure the requests are identical in both environments.
Alternatively, you could try running the spider in containers to avoid any OS particularities. A more scalable option would be a cloud-based scraping environment such as estela; although I have not tested it for this case, you could also try combining Scrapy with Selenium.
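If you do go the Scrapy-with-Selenium route, the usual wiring with the scrapy-selenium package looks roughly like the sketch below; the setting names come from that package, so double-check them against its README for the version you actually install:

# settings.py (sketch, assuming the scrapy-selenium package is installed)
from shutil import which

SELENIUM_DRIVER_NAME = 'chrome'
SELENIUM_DRIVER_EXECUTABLE_PATH = which('chromedriver')
SELENIUM_DRIVER_ARGUMENTS = ['--headless']

DOWNLOADER_MIDDLEWARES = {
    'scrapy_selenium.SeleniumMiddleware': 800,
}

# In the spider, requests are then issued through Selenium:
# from scrapy_selenium import SeleniumRequest
# yield SeleniumRequest(url=url, callback=self.parse)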
The Situation:
I recently started web scraping using Selenium and Scrapy, and I am working on a project where I have a csv file containing 42 thousand zip codes. My job is to take each zip code, enter it on this site, and scrape all the results.
The Problem:
The problem is that I have to keep clicking the 'load more' button until all the results have been displayed, and only once that has finished can I collect the data.
That might not be much of an issue on its own, but it takes about 2 minutes per zip code and I have 42,000 zip codes to get through.
The Code:
import scrapy
from numpy.lib.npyio import load
from selenium import webdriver
from selenium.common.exceptions import ElementClickInterceptedException, ElementNotInteractableException, ElementNotSelectableException, NoSuchElementException, StaleElementReferenceException
from selenium.webdriver.common.keys import Keys
from items import CareCreditItem
from datetime import datetime
import os
from scrapy.crawler import CrawlerProcess

global pin_code
pin_code = input("enter pin code")

class CareCredit1Spider(scrapy.Spider):
    name = 'care_credit_1'
    start_urls = ['https://www.carecredit.com/doctor-locator/results/Any-Profession/Any-Specialty//?Sort=D&Radius=75&Page=1']

    def start_requests(self):
        directory = os.getcwd()
        options = webdriver.ChromeOptions()
        options.headless = True
        options.add_experimental_option("excludeSwitches", ["enable-logging"])
        path = (directory + r"\\Chromedriver.exe")
        driver = webdriver.Chrome(path, options=options)

        # URL of the website
        url = "https://www.carecredit.com/doctor-locator/results/Any-Profession/Any-Specialty/" + pin_code + "/?Sort=D&Radius=75&Page=1"
        driver.maximize_window()
        # opening link in the browser
        driver.get(url)
        driver.implicitly_wait(200)

        try:
            cookies = driver.find_element_by_xpath('//*[@id="onetrust-accept-btn-handler"]')
            cookies.click()
        except:
            pass

        i = 0
        loadMoreButtonExists = True
        while loadMoreButtonExists:
            try:
                load_more = driver.find_element_by_xpath('//*[@id="next-page"]')
                load_more.click()
                driver.implicitly_wait(30)
            except ElementNotInteractableException:
                loadMoreButtonExists = False
            except ElementClickInterceptedException:
                pass
            except StaleElementReferenceException:
                pass
            except NoSuchElementException:
                loadMoreButtonExists = False

        try:
            previous_page = driver.find_element_by_xpath('//*[@id="previous-page"]')
            previous_page.click()
        except:
            pass

        name = driver.find_elements_by_class_name('dl-result-item')
        r = 1
        temp_list = []
        j = 0
        for element in name:
            link = element.find_element_by_tag_name('a')
            c = link.get_property('href')
            yield scrapy.Request(c)

    def parse(self, response):
        item = CareCreditItem()
        item['Practise_name'] = response.css('h1 ::text').get()
        item['address'] = response.css('.google-maps-external ::text').get()
        item['phone_no'] = response.css('.dl-detail-phone ::text').get()
        yield item

now = datetime.now()
dt_string = now.strftime("%d/%m/%Y")
dt = now.strftime("%H-%M-%S")
file_name = dt_string + "_" + dt + "zip-code" + pin_code + ".csv"

process = CrawlerProcess(settings={
    'FEED_URI': file_name,
    'FEED_FORMAT': 'csv'
})
process.crawl(CareCredit1Spider)
process.start()
print("CSV File is Ready")
items.py
import scrapy

class CareCreditItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    Practise_name = scrapy.Field()
    address = scrapy.Field()
    phone_no = scrapy.Field()
The Question:
Essentially my question is simple: is there a way to optimize this code so it performs faster, or are there other methods for scraping this data without it taking forever?
Since the site loads the data dynamically from an API, you can retrieve the data directly from the API. This will speed things up quite a bit, but I'd still implement a wait to avoid hitting a rate limit.
import requests
import time
import pandas as pd

zipcode = '00704'
radius = 75

url = f'https://www.carecredit.com/sites/ContentServer?d=&pagename=CCGetLocatorService&Zip={zipcode}&City=&State=&Lat=&Long=&Sort=D&Radius={radius}&PracticePhone=&Profession=&location={zipcode}&Page=1'
req = requests.get(url)
r = req.json()
data = r['results']

for i in range(2, r['maxPage'] + 1):
    url = f'https://www.carecredit.com/sites/ContentServer?d=&pagename=CCGetLocatorService&Zip={zipcode}&City=&State=&Lat=&Long=&Sort=D&Radius={radius}&PracticePhone=&Profession=&location={zipcode}&Page={i}'
    req = requests.get(url)
    r = req.json()
    data.extend(r['results'])
    time.sleep(1)

df = pd.DataFrame(data)
# Use dashes in the date so the file name does not contain path separators
df.to_csv(f'{pd.Timestamp.now().strftime("%d-%m-%Y_%H-%M-%S")}zip-code{zipcode}.csv')
There are multiple ways in which you can do this.
1. Create a distributed system in which you run the spider across multiple machines so the work runs in parallel.
In my opinion this is the best of the options, as you also end up with a scalable, dynamic solution that you can reuse many times over.
There are many ways of doing this; normally it consists of dividing the seed list (the zip codes) into several separate seed lists so that separate processes each work with their own seed list. The downloads then run in parallel, so on 2 machines it goes roughly 2 times faster, on 10 machines roughly 10 times faster, and so on.
To do this I would suggest looking into AWS, namely AWS Lambda, AWS EC2 instances or even AWS Spot instances; these are the ones I have worked with previously and they are not terribly hard to work with.
2. Alternatively, if you want to run it on a single machine, you can look into multithreading in Python, which can help you run the process in parallel on that one machine (see the sketch after this list).
3. This is another option, particularly if it is a one-off process: you can try running it with plain requests, which may speed it up, but with a massive number of seeds it is usually faster to develop a process that runs in parallel.
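As a minimal sketch of option 2, assuming you use the direct API calls from the answer above rather than Selenium, a thread pool can work through the zip code list in parallel; the worker count, sleep, and output file name are arbitrary choices and should respect whatever rate limit the site enforces:

import time
from concurrent.futures import ThreadPoolExecutor

import pandas as pd
import requests

def fetch_zip(zipcode, radius=75):
    """Fetch every result page for one zip code straight from the locator API."""
    base = ('https://www.carecredit.com/sites/ContentServer?d=&pagename=CCGetLocatorService'
            f'&Zip={zipcode}&City=&State=&Lat=&Long=&Sort=D&Radius={radius}'
            f'&PracticePhone=&Profession=&location={zipcode}&Page=')
    first = requests.get(base + '1').json()
    data = first['results']
    for page in range(2, first['maxPage'] + 1):
        data.extend(requests.get(base + str(page)).json()['results'])
        time.sleep(1)  # stay polite to the API
    return data

zipcodes = ['00704', '10001', '94105']  # placeholder list; read the 42k codes from your csv instead
with ThreadPoolExecutor(max_workers=5) as pool:
    rows = [row for result in pool.map(fetch_zip, zipcodes) for row in result]

pd.DataFrame(rows).to_csv('carecredit_results.csv', index=False)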
Here's the situation:
I have a .pac URL as a proxy. In Ubuntu, it works as the network proxy when I set the network proxy to automatic mode and fill the .pac URL into the Configuration URL field.
When I use Python to crawl Google Images, plain requests to Google don't work, so I use Selenium's Chrome webdriver to simulate a user's mouse and keyboard actions, and that works.
Then I added the '--headless' argument to increase the amount of concurrency, and I got a TimeoutException.
Then I downloaded the .pac file and tried using options.add_argument('--proxy-pac-url=xxx.pac') to solve the problem, but the proxy still doesn't work.
I then found a suggested solution that uses a Chrome extension called 'SwitchyOmega' to apply a .pac file proxy.
When I downloaded the latest release from GitHub and used options.add_extension('xxx/SwitchyOmega_Chromium.crx') to load the extension, I got: "from unknown error: CRX verification failed: 3".
Finally, I configured SwitchyOmega in Chrome and used the developer tools to pack the local extension files into a .crx; that extension loaded correctly in the webdriver, but it turned out to be unconfigured.
So how can I fix this proxy problem? Thanks!
Here is my code:
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support.expected_conditions import element_to_be_clickable


class GoogleCrawler:
    def __init__(self):
        driver_executable = self.get_driver_executable()
        options = webdriver.ChromeOptions()
        options.add_argument('blink-settings=imagesEnabled=false')
        # options.add_argument('--headless')
        # options.add_argument('--proxy-pac-url=./xxx.pac')
        # options.add_extension('./SwitchyOmega_Chromium.crx')
        self.browser = webdriver.Chrome(driver_executable,
                                        chrome_options=options)
        self.driver_version_check()

    def get_google_image_urls(self, keyword):
        self.browser.get(f'https://www.google.com/search?q={keyword}&tbm=isch')
        time.sleep(2)

        img_urls = []
        first_thumbnail_image_xpath = '//div[@data-ri="0"]'
        image_xpath = '//div[@class="irc_c i8187 immersive-container"]//img[@class="irc_mi"]'

        body_element = self.browser.find_element_by_tag_name('body')
        wait = WebDriverWait(self.browser, 15)
        first_thumbnail_image = wait.until(
            element_to_be_clickable((By.XPATH, first_thumbnail_image_xpath)))
        first_thumbnail_image.click()

        scroll_flag = 0
        last_scroll_distance = 0
        while scroll_flag <= 50:
            image_elements = self.browser.find_elements(By.XPATH, image_xpath)
            img_urls.extend([
                image_element.get_attribute('src')
                for image_element in image_elements
            ])
            body_element.send_keys(Keys.RIGHT)
            scroll_distance = self.browser.execute_script(
                'return window.pageYOffset;')
            if scroll_distance == last_scroll_distance:
                scroll_flag += 1
            else:
                last_scroll_distance = scroll_distance
                scroll_flag = 0

        self.browser.close()
        img_urls = set(img_urls)
        print(
            f'[INFO]Scraping Image urls DONE: Keyword: {keyword}, Total: {len(img_urls)}'
        )
        return keyword, img_urls
Since headless Chrome doesn't support PAC files, and since it doesn't support Chrome extensions either, I don't think there is a way to make this work with PAC files for you.
Can you run your own proxy, put the PAC logic in that proxy, and pass it to Chrome via the --proxy-server flag?
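On the Chrome side, that would look roughly like the sketch below; it assumes you already have a forwarding proxy that implements your PAC logic listening locally, and the 127.0.0.1:8080 address is only a placeholder:

from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument('--headless')
# Point headless Chrome at the local proxy that applies the PAC rules for you
options.add_argument('--proxy-server=http://127.0.0.1:8080')

driver = webdriver.Chrome(options=options)
driver.get('https://www.google.com/search?q=test&tbm=isch')
print(driver.title)
driver.quit()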
Running into something interesting when trying to set up a Selenium webdriver to scrape fantasy football stats from ESPN. When I execute the following cells in a jupyter notebook I can reach the page I'm looking for (the draft recap page of my fantasy league) and successfully log in to my account, accessing the page:
# Cell 1
driver = webdriver.Firefox()
driver.get(url)

# Cell 2
i = 0
iter_again = True
iframes = driver.find_elements_by_tag_name('iframe')
while i < len(iframes) and iter_again:
    driver.switch_to_frame(iframes[i])
    if len(driver.find_elements_by_class_name("input-wrapper")) > 0:
        username, password = driver.find_elements_by_class_name("input-wrapper")
        iter_again = False
    else:
        sleep(1)
        driver.switch_to_default_content()
    i += 1

# Cell 3
username.find_elements_by_tag_name('input')[0].send_keys(espn_username)
password.find_elements_by_tag_name('input')[0].send_keys(espn_password)

# Cell 4
driver.find_elements_by_tag_name('button')[0].click()

# Cell 5
driver.refresh()
The strange thing, though, is that when I put all of this in a function and return the webdriver object, ESPN won't let me log in. I get an error message saying that ESPN is experiencing technical difficulties at this time and that I may not be able to log in (they're right, I can't).
I initially thought this could be some sort of rate-limiting issue, but I can't think of anything different about the HTTP requests between the functional form and the cell-by-cell approach. For what it's worth, I've tested the functional approach both in a jupyter notebook environment and as a standalone script run from the CLI. Any thoughts? All help/feedback is greatly appreciated!
EDIT - Adding the script that doesn't execute properly
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from time import sleep

def get_active_webdriver(url, espn_username, espn_password, headless=False):
    driver = webdriver.Firefox()
    driver.get(url)
    i = 0
    iter_again = True
    # find the iframe with the login form and log in
    iframes = driver.find_elements_by_tag_name('iframe')
    while i < len(iframes) and iter_again:
        driver.switch_to_frame(iframes[i])
        if len(driver.find_elements_by_class_name("input-wrapper")) > 0:
            username, password = driver.find_elements_by_class_name("input-wrapper")
            iter_again = False
        else:
            sleep(1)
            driver.switch_to_default_content()
        i += 1
    username.find_elements_by_tag_name('input')[0].send_keys(espn_username)
    password.find_elements_by_tag_name('input')[0].send_keys(espn_password)
    driver.find_elements_by_tag_name('button')[0].click()
    driver.refresh()
    return driver

if __name__ == "__main__":
    url = #url here
    espn_username = #username
    espn_password = #password
    driver = get_active_webdriver(url, espn_username, espn_password)
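One difference worth ruling out, offered only as a hedged guess: when the cells are run one at a time there are natural pauses between each step, while the function fires everything back to back, so timing may be the only real difference between the two runs. A small sketch of an explicit wait that could replace the bare button click inside the function (click_login_when_ready is a hypothetical helper, not part of the original code):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def click_login_when_ready(driver, timeout=15):
    """Hypothetical helper: wait until the login button is clickable before clicking it."""
    WebDriverWait(driver, timeout).until(
        EC.element_to_be_clickable((By.TAG_NAME, 'button'))
    ).click()

Calling click_login_when_ready(driver) in place of driver.find_elements_by_tag_name('button')[0].click() would at least rule timing in or out as the difference.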
I want to make a Python program that gets all the links for a certain Google search query, so I loop over the 30 search result pages, and when Google shows me a reCAPTCHA I solve it manually.
Here is what my code looks like:
import time
import urllib.parse

from selenium import webdriver

driver = webdriver.Firefox()
number_pages = 30
query = 'hello world'
query = urllib.parse.quote_plus(query)
url = "https://www.google.com/search?q=" + query + "&&start="

with open('result.txt', 'w') as fp:
    for i in range(1, number_pages + 1):
        # loop over the 30 result pages
        page_url = url + str((i - 1) * 10)
        print("# " + page_url)
        driver.get(page_url)
        while len(driver.find_elements_by_id('recaptcha')) != 0:
            # ReCaptcha: sleep until the user solves it manually
            print('sleeping...!')
            time.sleep(10)
        els = driver.find_elements_by_tag_name('cite')
But when I try to submit the reCAPTCHA form it gives me the error:
Cannot contact reCAPTCHA. Check your connection and try again
When I use a normal browser (Google Chrome or Firefox) the error doesn't occur, so I think reCAPTCHA is blocking the webdriver.
Can anyone please explain what the exact issue is here and how it can be fixed?