Dealing With Ajax Requests in Python

Case:
I am trying to extract the number of result pages from a site. I apply a filter on the page with the code below:
import time
import urllib2

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

# Open Firefox with JavaScript enabled and load the page
fp = webdriver.FirefoxProfile()
fp.set_preference("javascript.enabled", True)
b = webdriver.Firefox(firefox_profile=fp)
b.get(url)
time.sleep(10)

# Run the search
search = b.find_element_by_name("rb")
search.clear()
search.send_keys('dove')
search.send_keys(Keys.ESCAPE)
search.submit()

# Apply the two filters
shampoo_sel = b.find_element_by_id('flt-46')
shampoo_sel.click()
conditioner_sel = b.find_element_by_id('flt-47')
conditioner_sel.click()
time.sleep(5)

# Re-download the current URL and look for the pagination element
search_url = b.current_url
dp = urllib2.urlopen(search_url).read()
dp_soup = BeautifulSoup(dp)
search_page_num = dp_soup.find("li", {"id": "pagContinue"})
print search_page_num
When I save the current URL, it is the same before and after the filter is applied, so re-downloading that URL does not give me the number of pages for the filtered results.
What should I do in this case?
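Since the filters are applied via Ajax, the page content changes without the URL changing, so downloading b.current_url with urllib2 always returns the unfiltered page. A minimal sketch of one way around this, assuming the "pagContinue" id from the snippet above is correct: parse the DOM that Selenium already holds instead of re-fetching the URL.
from bs4 import BeautifulSoup

# Parse the browser's current DOM, which already reflects the Ajax filter
dp_soup = BeautifulSoup(b.page_source, "html.parser")
search_page_num = dp_soup.find("li", {"id": "pagContinue"})
print(search_page_num)

# Alternatively, read the element straight from Selenium:
# search_page_num = b.find_element_by_id("pagContinue").text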

Related

Downloading data from webpage that has no forms using mechanicalsoup

I'm trying to extract the data from this website: https://www.texmesonet.org/DataProducts/CustomDownloads
I have to fill in a few text fields and select a few options before downloading the data, and I'm trying to do it with MechanicalSoup. The roadblock I'm facing is that the tags corresponding to the parameters I need to set are not inside a form. Is there any way to tackle this using MechanicalSoup? I have pasted the code that I'm using to select the parameters below.
import mechanicalsoup

browser = mechanicalsoup.Browser()
url = 'https://www.texmesonet.org/DataProducts/CustomDownloads'
page = browser.get(url)
html_page = page.soup
#print(html_page.select('div'))

# Select the parameters by editing attribute values in the parsed HTML
region = html_page.select('select')[0]
region.select('option')[0]["value"] = 'Station'
data_type = html_page.select('select')[1]
data_type.select('option')[2]["value"] = 'Timeseries'
#print(html_page.select('span'))
start_date = html_page.find_all("div", {"class": "col50 field-container"})[2]
start_date.select('input')[0]["value"] = '11/28/2022'
end_date = html_page.find_all("div", {"class": "col50 field-container"})[3]
end_date.select('input')[0]["value"] = '12/05/2022'
station = html_page.find_all("div", {"class": "col50 field-container"})[4]
station.select('input')[0]["value"] = 'Headwaters Ranch'
interval = html_page.select('select')[3]
interval.select('option')[0]["value"] = 'Daily'
units = html_page.select('select')[5]
units.select('option')[0]["value"] = 'US / Customary'
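Because this page builds its download request with JavaScript rather than a plain HTML form, MechanicalSoup has nothing to submit, and editing the parsed soup only changes your local copy of the page. Two common workarounds are driving a real browser (Selenium) or replicating the underlying HTTP request that the JavaScript sends (found via the browser's network tab). Below is a minimal Selenium sketch; the locators and element indexes are assumptions for illustration and would need to be checked against the actual page.
# Sketch: fill the download options with Selenium instead of MechanicalSoup.
# The selectors/indexes below are hypothetical and must be verified on the real page.
from selenium import webdriver
from selenium.webdriver.support.ui import Select

driver = webdriver.Chrome()
driver.get('https://www.texmesonet.org/DataProducts/CustomDownloads')

selects = driver.find_elements_by_css_selector('select')
Select(selects[0]).select_by_visible_text('Station')     # data source
Select(selects[1]).select_by_visible_text('Timeseries')  # data type

inputs = driver.find_elements_by_css_selector('.col50.field-container input')
inputs[0].clear()
inputs[0].send_keys('11/28/2022')   # start date
inputs[1].clear()
inputs[1].send_keys('12/05/2022')   # end date
If the site exposes a plain download endpoint, replaying that request directly with requests is usually lighter, but the endpoint and its parameters have to be read from the browser's network tab first.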

IndexError: list index out of range, even though the card list is indexed from zero

I am trying to scrape a website using Python. Here is my code:
import sys   # used for the command-line argument
import time

from bs4 import BeautifulSoup
from selenium import webdriver

# generate the search URL for a product
def get_url(product):
    product = product.replace(' ', '%20')
    template = 'https://www.carousell.com.my/search/{}'
    url = template.format(product)
    return url

# extract the details of a single product card
def get_all_products(card):
    product_image = card.find('img', 'D_iN D_iK D_uf')
    product_image = product_image['src']
    product_name = card.find('p', 'D_bU M_ch D_aQ M_aL D_bV M_ci D_bY M_cl D_ca M_cn D_ce M_cq D_ch M_ct D_bR').text.strip()
    product_price = card.find('p', 'D_bU M_ch D_aQ M_aL D_bV M_ci D_bY M_cl D_ca M_cn D_ce M_cq D_cg M_cs D_bQ').text.strip()
    product_summary = card.find('p', 'D_bU M_ch D_aQ M_aL D_bV M_ci D_bY M_cl D_ca M_cn D_ce M_cq D_cg M_cs D_bR').text.strip()
    anchor = card.find('a', 'D_gk M_bj')
    product_link = 'https://www.carousell.com.my' + anchor.get('href')
    product_info = (product_image, product_name, product_price,
                    product_summary, product_link)
    return product_info

def main(product):
    url = get_url(product)
    options = webdriver.ChromeOptions()
    options.add_argument('headless')
    options.add_argument('--log-level=3')
    driver = webdriver.Chrome(executable_path='C:\\webDrivers\\chromedriver.exe', options=options)
    driver.get(url)
    driver.maximize_window()
    time.sleep(5)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    product_card = soup.find_all('div', 'D_jb D_ph D_pm M_np')
    # fetch a single product card from Carousell
    singleCard = product_card[0]
    productDetails = get_all_products(singleCard)
    return productDetails

pname = str(sys.argv[1])
scrape_data = main(pname)
print(scrape_data)
I got this error when I tried to run it from cmd:
File "C:\wamp\www\project\Carousell_Scrap_PHP.py", line 63, in main
singleCard = product_card[0]
IndexError: list index out of range
I have tried the same code for Shopee and it runs well, but when I try it with another Carousell page it shows this error. I also tried to find an answer for this error but I couldn't. I would really appreciate it if someone could help me with this. I know indexes start from 0, and I already use 0 in the code, yet it still shows an IndexError.
You are getting this error since product_card is empty.
This is most likely caused by the locator 'div', 'D_jb D_ph D_pm M_np' you are using to get these elements.
I couldn't find a clean locator for this element, but the following worked for me:
product_card = soup.find_all('div','.D_ow.D_qW.D_rb')
Maybe it's because there are spaces in the class name D_jb D_ph D_pm M_np. Use only the first part of the class name. In the linked page, for the div tag with class D_ow D_qW D_rb, I tried with only D_ow and it returned the 20 items:
driver.get("https://www.carousell.com.my/search/waist%20bag?addRecent=true&canChangeKeyword=true&includeSuggestions=true&searchId=F7LtXr")
soup = BeautifulSoup(driver.page_source,'html.parser')
product_card = soup.find_all('div','D_ow')
print(len(product_card))
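A note on why the full class string fails: in BeautifulSoup, passing a space-separated string as the class filter only matches tags whose class attribute is exactly that string, which breaks easily on pages with auto-generated class names. A CSS selector via select() matches tags carrying all of the listed classes regardless of order or extra classes. A small sketch, assuming the D_ow / D_qW / D_rb classes from the answer above:
# Sketch: match the product cards by a combination of classes with a CSS selector,
# which is more tolerant than an exact class-attribute string.
soup = BeautifulSoup(driver.page_source, 'html.parser')
product_card = soup.select('div.D_ow.D_qW.D_rb')   # all three classes, in any order
print(len(product_card))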

Is there a way to concat this base URL with a list? I've tried literally everything possible, to no avail

I'm trying to get data from several URLs that share the same base URL. This is my code:
path = 'C:/ChromerDriver/chromedriver'
driver = webdriver.Chrome(executable_path=path, options=options)

link_list = ['/organization/4m-carbon-fiber', '/organization/aeropowder', '/organization/algiknit']

driver.implicitly_wait(10)

df = pd.DataFrame()

urls = driver.get('https://www.crunchbase.com'.join(link_list))

for url in urls:
    try:
        name = driver.find_element(By.XPATH, '//span[contains(@class, "profile-name")]').text
        website = driver.find_element_by_xpath('//*[@class = "component--field-formatter layout-row layout-align-start-end link-accent ng-star-inserted"]').get_attribute('href')
        logo = driver.find_element_by_xpath('/html/body/chrome/div/mat-sidenav-container/mat-sidenav-content/div/ng-component/entity-v2/page-layout/div/div/profile-header/div/header/div/div/div/div[1]/identifier-image/div/img').get_attribute('src')
    except:
        time.sleep(5)

df = df.append(pd.DataFrame({'name': name,
                             'website': website,
                             'logo': logo},
                            index=[0]), ignore_index=True)

df.to_csv('raw_data.csv', index=False)
How should I combine link_list with the base URL properly?
urls = ['https://www.crunchbase.com' + elem for elem in link_list]

for url in urls:
    try:
        resp = driver.get(url)
        ...
Also, place df.append() inside the for-loop if you want to append all results and not only the last one.
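Putting both suggestions together, a minimal sketch of the loop might look like this; the XPath for the name is the one from the question, while the per-row append inside the loop is the change described above. Collecting rows in a list and building the DataFrame once also avoids repeated df.append() calls, which are deprecated in recent pandas versions.
# Sketch: build full URLs from the base + paths, visit each one,
# and collect one row per company inside the loop.
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()   # adjust executable_path/options as in the question
link_list = ['/organization/4m-carbon-fiber', '/organization/aeropowder', '/organization/algiknit']
urls = ['https://www.crunchbase.com' + elem for elem in link_list]

rows = []
for url in urls:
    driver.get(url)
    try:
        name = driver.find_element(By.XPATH, '//span[contains(@class, "profile-name")]').text
    except Exception:
        name = None   # page did not load or the layout changed
    rows.append({'name': name, 'url': url})

df = pd.DataFrame(rows)
df.to_csv('raw_data.csv', index=False)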

URL changes while using proxy and Selenium

I am new to web scraping so please forgive my ignorance.
I built a program to scrape Zillow, and everything has worked fine for the most part. My problem is I am using a proxy service called proxycrawl that easily allows me to integrate proxies into my program. This is done by placing https://api.proxycrawl.com/?token=xxx&url= before my actual URL. What I have noticed is that when the program clicks on an "a" tag, the URL changes to the example below:
Before: [screenshot showing the proxied URL before the click]
After: [screenshot showing the URL after the click]
Any clicks through the program, or made manually, result in the site changing to the proxycrawl site, where I get the 404 error. Any ideas?
# Browser open
print(".....Opening Browser.....")
Browser = webdriver.Chrome(executable_path='/usr/local/bin/chromedriver')
Browser.maximize_window()

# browser page
url = urllib.parse.quote_plus('https://www.zillow.com/homes/for_sale/Bakersfield-CA-93312/house,mobile,land,townhouse_type/97227_rid/35.4606,-119.037467,35.317856,-119.200888_rect/12_zm/0_mmm/')
Browser.get('https://api.proxycrawl.com/?token=xxx&url=' + url)
print("Opening Zillow")
time.sleep(10)

last_page = int(Browser.find_element_by_xpath("""//ol[@class="zsg-pagination"]//li[last()-1]""").text)
# print last_page
page = 0
count = 0

csv_file = open('listings.csv', 'w')
fieldnames = ['address', 'price', 'zestimate', 'beds', 'baths', 'feet', 'desc', 'Type', 'year_built', 'heating',
              'cooling', 'parking', 'lot', 'days_on_market', 'pricepsqr', 'saves', 'interior', 'spaces_amenities',
              'construction', 'exterior', 'parking1', 'mls', 'other']
writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
writer.writeheader()

for i in range(last_page):
    page = page + 1
    n = 0
    listings = Browser.find_elements_by_xpath("""//*[@id="search-results"]/ul/li""")
    for i in range(len(listings)):
        n = i + 1
        listing_dict = {}
        print("Scraping the listing number {0} on page {1}, the count is {2}".format(n, page, count))
        if (count) % 11 == 0:
            listings = Browser.find_elements_by_xpath('//*[@id="search-results"]/ul/li')
            time.sleep(2)
        try:
            # Finds Listings
            listings = Browser.find_elements_by_xpath("""//*[@id="search-results"]/ul/li""")
            print("Looking Up listings")
            # Opens Listing
            listings[i].find_elements_by_tag_name('a')[0].click()
            print("Opening Listing")
            time.sleep(2)
            # Opens "See More Tab"
            Browser.find_element_by_partial_link_text('See More').click()
            # Prepare for Scrape
            time.sleep(2)
I did speak with proxycrawl, and they stated that the URL had to be encoded, which I did, with no luck. After encoding, I replied and got the following statement:
"You are sending your requests double encoded and your get a response of pc_status: 602. Those requests are failing and you should fix them. Please only encode the URLs once, encoding the URLs more than once will result in a failing request."
It looks like the page is trying to redirect you relatively, so the relative link gets resolved against the proxycrawl domain instead of the original site.
In this specific use case, you could hack your way around the encoding issue by doing something similar to the following:
# https://api.proxycrawl.com/homes/for_sale/Test/one,two
x = driver.current_url
#/homes/for_sale/Test/one,two
r = x[26:]
# base url = https://api.proxycrawl.com/?token=xxx&url=
u = base_url + r
driver.get(u)
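A slightly more general version of that workaround is sketched below: detect when the browser has landed on a bare proxycrawl path, strip the proxy domain off, and re-request the original site through the proxy with the URL encoded exactly once. The variable names and the token placeholder are illustrative, not part of the proxycrawl API.
# Sketch: re-wrap a relative redirect back through the proxy.
# Browser is the webdriver from the question; the token is a placeholder.
import urllib.parse

PROXY_PREFIX = 'https://api.proxycrawl.com'
PROXY_BASE = PROXY_PREFIX + '/?token=xxx&url='
SITE_BASE = 'https://www.zillow.com'

current = Browser.current_url
if current.startswith(PROXY_PREFIX) and '&url=' not in current:
    # The click resolved a relative link against the proxy domain;
    # recover the path and send it back through the proxy, encoded once.
    path = current[len(PROXY_PREFIX):]
    Browser.get(PROXY_BASE + urllib.parse.quote_plus(SITE_BASE + path))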

Google news crawler flip pages

Continuing previous work to crawl all news results for a query and return the title and URL, I am refining the crawler to get all results from all pages in Google News. The current code seems to only return the first page of Google News search results. I would be grateful to know how to get the results from all pages. Many thanks!
My code is below:
import requests
from bs4 import BeautifulSoup
import time
import datetime
from random import randint
import numpy as np
import pandas as pd

query2Google = input("What do you want from Google News?\n")

def QGN(query2Google):
    s = '"' + query2Google + '"'  # keywords for the query
    s = s.replace(" ", "+")
    date = str(datetime.datetime.now().date())  # timestamp
    filename = query2Google + "_" + date + "_" + 'SearchNews.csv'  # csv filename
    f = open(filename, "wb")
    url = "http://www.google.com.sg/search?q=" + s + "&tbm=nws&tbs=qdr:y"  # query news results within one year, sorted by date
    # htmlpage = urllib2.urlopen(url).read()
    time.sleep(randint(0, 2))  # waiting
    htmlpage = requests.get(url)
    print("Status code: " + str(htmlpage.status_code))
    soup = BeautifulSoup(htmlpage.text, 'lxml')
    df = []
    for result_table in soup.findAll("div", {"class": "g"}):
        a_click = result_table.find("a")
        # print("-----Title----\n" + str(a_click.renderContents()))  # Title
        # print("----URL----\n" + str(a_click.get("href")))  # URL
        # print("----Brief----\n" + str(result_table.find("div", {"class": "st"}).renderContents()))  # Brief
        # print("Done")
        df = np.append(df, [str(a_click.renderContents()).strip("b'"),
                            str(a_click.get("href")).strip('/url?q='),
                            str(result_table.find("div", {"class": "st"}).renderContents()).strip("b'")])
    df = np.reshape(df, (-1, 3))
    df1 = pd.DataFrame(df, columns=['Title', 'URL', 'Brief'])
    print("Search Crawl Done!")
    df1.to_csv(filename, index=False, encoding='utf-8')
    f.close()
    return

QGN(query2Google)
There used to be an Ajax API, but it's no longer available.
Still, you can modify your script with a for loop if you want to get a fixed number of pages, or a while loop if you want to get all pages.
Example:
url = "http://www.google.com.sg/search?q="+s+"&tbm=nws&tbs=qdr:y&start="
pages = 10 # the number of pages you want to crawl #
for next in range(0, pages*10, 10) :
page = url + str(next)
time.sleep(randint(1, 5)) # you may need longer than that #
htmlpage = requests.get(page) # you should add User-Agent and Referer #
print("Status code: " + str(htmlpage.status_code))
if htmlpage.status_code != 200 :
break # something went wrong #
soup = BeautifulSoup(htmlpage.text, 'lxml')
... process response here ...
next_page = soup.find('td', { 'class':'b', 'style':'text-align:left' })
if next_page is None or next_page.a is None :
break # there are no more pages #
Keep in mind that Google doesn't like bots; you might get banned.
You could add 'User-Agent' and 'Referer' headers to simulate a web browser, and use time.sleep(random.uniform(2, 6)) to simulate a human... or use Selenium. A sketch of the headers approach is below.
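A minimal sketch of that suggestion, sending browser-like headers and a randomized delay with requests; the header values are only examples, not anything Google requires.
# Sketch: fetch a results page with browser-like headers and a random pause.
import random
import time

import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
    'Referer': 'https://www.google.com/',
}

time.sleep(random.uniform(2, 6))                  # behave less like a bot
htmlpage = requests.get(page, headers=headers)    # page as built in the loop above
print("Status code: " + str(htmlpage.status_code))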
You can also add &num=25 to the end of your query and you'll get back a page with that number of results; in this example you'll get 25 Google results back.
