Downloading data from a webpage that has no forms using MechanicalSoup - Python

I'm trying to extract the data from this website - https://www.texmesonet.org/DataProducts/CustomDownloads
I have to fill in some text fields and select a few options before downloading the data, and I'm trying to do it with MechanicalSoup. The roadblock I'm facing is that the tags corresponding to the parameters that need to be set before the download are not inside a form. Is there any way to tackle this using MechanicalSoup? I have pasted the code that I'm using to select the parameters below.
import mechanicalsoup

browser = mechanicalsoup.Browser()
url = 'https://www.texmesonet.org/DataProducts/CustomDownloads'
page = browser.get(url)
html_page = page.soup
#print(html_page.select('div'))
region = html_page.select('select')[0]
region.select('option')[0]["value"] = 'Station'
data_type = html_page.select('select')[1]
data_type.select('option')[2]["value"] = 'Timeseries'
#print(html_page.select('span'))
start_date = html_page.find_all("div", {"class": "col50 field-container"})[2]
start_date.select('input')[0]["value"] = '11/28/2022'
end_date = html_page.find_all("div", {"class": "col50 field-container"})[3]
end_date.select('input')[0]["value"] = '12/05/2022'
station = html_page.find_all("div", {"class": "col50 field-container"})[4]
station.select('input')[0]["value"] = 'Headwaters Ranch'
interval = html_page.select('select')[3]
interval.select('option')[0]["value"] = 'Daily'
units = html_page.select('select')[5]
units.select('option')[0]["value"] = 'US / Customary'
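Note that MechanicalSoup can only fill fields that sit inside a <form> element (via select_form / submit_selected); changing the "value" attributes on the parsed soup, as above, only edits your local copy of the HTML and sends nothing to the server. When a page builds its download request in JavaScript instead of a form, one common workaround is to find the underlying HTTP request in the browser's network tab and replay it through the browser's requests session. The sketch below only illustrates that pattern; the endpoint and parameter names are hypothetical placeholders, not the real TexMesonet API.

import mechanicalsoup

browser = mechanicalsoup.Browser()

# Hypothetical endpoint and parameter names -- inspect the network tab on the
# Custom Downloads page to find the request the site actually sends.
download_url = 'https://www.texmesonet.org/api/CustomDownloads/data'  # placeholder
params = {
    'region': 'Station',
    'dataType': 'Timeseries',
    'startDate': '11/28/2022',
    'endDate': '12/05/2022',
    'station': 'Headwaters Ranch',
    'interval': 'Daily',
    'units': 'US / Customary',
}

# browser.session is the underlying requests.Session, so any cookies picked up
# while browsing are reused for this request.
response = browser.session.get(download_url, params=params)
print(response.status_code)
print(response.text[:500])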

Related

Python / BeautifulSoup webscraper returning "None"

I'm trying to build a webscraper that collects freelance gig postings from different websites into one place. My code is below, and it keeps returning "None". I'm a bit stuck at this point; if you can help identify why it keeps doing this, that would be great.
import requests
from bs4 import BeautifulSoup
import pprint
res1 = requests.get('https://www.airtasker.com/tasks/?task_states=posted&lat=-33.7918&lon=151.0806&location_name=Eastwood%2C%20NSW&radius=20000000&carl_ids=&task_types=both&max_price=9999&min_price=5&search_term=python&badges=&sort_by=posted_desc') # this is where we will scrape the info from
soup1 = BeautifulSoup(res1.text, 'html.parser') # this tells BS to give us HTML code for the page
links1 = soup1.select('.new-task-list-item new-task-list-item--open') # link of each gig
subtext1 = soup1.select('.new-task-list-item__date at-icon-calendar') # date of each gig
res2 = requests.get('https://www.airtasker.com/tasks/?task_states=posted&lat=-33.7918&lon=151.0806&location_name=Eastwood%2C%20NSW&radius=20000000&carl_ids=&task_types=both&max_price=9999&min_price=5&search_term=web%20developer&badges=&sort_by=posted_desc')
soup2 = BeautifulSoup(res2.text, 'html.parser')
links2 = soup2.select('.new-task-list-item new-task-list-item--open')
subtext2 = soup2.select('.new-task-list-item__date at-icon-calendar')
res3 = requests.get('https://www.upwork.com/freelance-jobs/website/')
soup3 = BeautifulSoup(res3.text, 'html.parser')
links3 = soup3.select('.job-title')
subtext3 = soup3.select('.text-muted')
res4 = requests.get('https://www.upwork.com/freelance-jobs/data-science/')
soup4 = BeautifulSoup(res4.text, 'html.parser')
links4 = soup4.select('.job-title')
subtext4 = soup4.select('.text-muted')
res5 = requests.get('https://www.upwork.com/freelance-jobs/bot-development/')
soup5 = BeautifulSoup(res5.text, 'html.parser')
links5 = soup5.select('.job-title')
subtext5 = soup5.select('.text-muted')
res6 = requests.get('https://www.upwork.com/freelance-jobs/python-script/')
soup6 = BeautifulSoup(res6.text, 'html.parser')
links6 = soup6.select('.job-title')
subtext6 = soup6.select('.text-muted')
mega_links = links1 + links2 + links3 + links4 + links5 + links6
mega_subtext = subtext1 + subtext2 + subtext3 + subtext4 + subtext5 + subtext6
def extract(links, subtexts):
    joblist = []
    for indx, item in enumerate(links):
        title = item.getText()
        href = item.get('href')
        joblist.append({'title': title, 'link': href})
    return joblist

pprint.pprint(extract(mega_links, mega_subtext))
I'm not sure what exactly you are trying to extract from the scraped web pages. Here's what I tried from my end:
Your links variables are empty lists because no element matching those selectors exists on the pages you're scraping. For example, in the browser console for the first page you're scraping, the element you're trying to select simply isn't there.
I would recommend that you confirm which element you actually want to scrape and double-check its class.
Another point of consideration: when you print your soup variables, you will notice that you get a Cloudflare page as the output.
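A minimal way to verify this yourself (a sketch, reusing the first URL and the selectors from the question) is to print how many nodes each selector matches and peek at what the response actually contains. Note also that two classes on the same element are joined without a space in a CSS selector:

import requests
from bs4 import BeautifulSoup

res = requests.get('https://www.airtasker.com/tasks/?task_states=posted&lat=-33.7918&lon=151.0806&location_name=Eastwood%2C%20NSW&radius=20000000&carl_ids=&task_types=both&max_price=9999&min_price=5&search_term=python&badges=&sort_by=posted_desc')
soup = BeautifulSoup(res.text, 'html.parser')

# If these print 0, the selectors match nothing in the downloaded HTML.
print(len(soup.select('.new-task-list-item.new-task-list-item--open')))
print(len(soup.select('.new-task-list-item__date.at-icon-calendar')))

# Peek at the raw response -- a Cloudflare challenge page here means the
# listings were never in the HTML you received.
print(res.text[:300])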

Fill in internet form using a pandas dataframe and mechanicalsoup

I am using MechanicalSoup, and I need to auto-fill an internet form with information from a dataframe.
The dataframe is called checkdataframe.
import mechanicalsoup
from bs4 import BeautifulSoup

br = mechanicalsoup.StatefulBrowser(user_agent='MechanicalSoup')

for row in checkdataframe.codenumber:
    if row in '105701':
        url = "https://www.internetform.com"
        br.open(url)
        for column in checkdataframe[['ID', 'name', 'email']]:
            br.select_form('form[action="/internetform.com/info.do"]')
            br.form['companycode'] = checkdataframe['ID']      # THIS INFORMATION SHOULD COME FROM THE DATAFRAME
            br.form['username'] = checkdataframe['name']       # THIS INFORMATION SHOULD COME FROM THE DATAFRAME
            br.form['emailaddress'] = checkdataframe['email']  # THIS INFORMATION SHOULD COME FROM THE DATAFRAME
            response = br.submit_selected()
            soup = BeautifulSoup(response.text, 'lxml')
            table = soup.find('div', attrs={'class': 'row'})
            for row in table.findAll('div', attrs={'class': 'col-md-4 col-4'}):
                scrapeinfo = {}
                scrapeinfo['STATUS'] = row.div
                scrapeinfo['NAMEOFITEM'] = row.label
                scrapeinfo['PRICE'] = row.div
                checkdataframe.append(scrapeinfo)
    else:
        break
How can I make br.form['companycode'] = checkdataframe['ID'] work, instead of hard-coding the values like this?
br.form['companycode'] = '105701'
br.form['username'] = 'myusername'
br.form['emailaddress'] = 'myusername@gmail.com'
I also need to append the information that is scraped back into checkdataframe.
I need help, please.
Use Selenium for this activity.
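If you do stay with MechanicalSoup, the core problem in the question is that checkdataframe['ID'] is a whole pandas Series, while a form field needs one scalar value per submission. A minimal sketch of iterating the dataframe row by row (reusing the URL, form selector, and field names from the question, which are assumptions about the real site):

import mechanicalsoup
import pandas as pd

br = mechanicalsoup.StatefulBrowser(user_agent='MechanicalSoup')
scraped_rows = []

# iterrows() yields one row at a time, so each form field gets a single value.
for _, row in checkdataframe.iterrows():
    br.open("https://www.internetform.com")
    br.select_form('form[action="/internetform.com/info.do"]')
    br['companycode'] = str(row['ID'])
    br['username'] = row['name']
    br['emailaddress'] = row['email']
    response = br.submit_selected()
    # ...parse response.text with BeautifulSoup as in the question and
    # append each scrapeinfo dict to scraped_rows...

# Collect the scraped rows into their own frame and join back if needed.
scraped = pd.DataFrame(scraped_rows)

Building a separate frame once, rather than appending to checkdataframe inside the loop, is usually simpler and faster.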

Dynamic Web Scraping with Selenium

I was trying to scrape data from Amazon using Selenium and Beautiful Soup.
I have scraped and obtained data from the first page, defined a function for it, and managed to get the second page opened with the click() method.
The soup objects used on the first page are similar to the objects on the second page. I am planning to scrape data up to page 6.
I was wondering if I could apply the function defined for the first page to the next 5 pages and append the data, which can later be exported as a CSV.
Any suggestions regarding this would be appreciated.
def data_collection():
    title = soup.find_all(name="span", class_="a-size-base-plus a-color-base a-text-normal")
    all_specs = [specs.getText() for specs in title]
    brands = [items.split(' ', 1)[0] for items in all_specs]  # Brand
    phones = [text.split(')')[0].split('(') for text in all_specs]
    spec = []
    for i in phones:
        for j in i:
            spec.append(j)
    model = spec[::2]           # Model
    specifications = spec[1::2] # Specs
    s_price_obj = soup.find_all(name="span", class_="a-price-whole")
    selling_price = [price.getText() for price in s_price_obj]  # Price
    review_obj = soup.find_all(name="span", class_="a-icon-alt")
    review = [ratings.getText() for ratings in review_obj]
    review = review[:24]  # Ratings
    quantity_obj = soup.find_all(name="span", class_="a-size-base")
    quantity_sold = [items.getText() for items in quantity_obj]
    quantity_sold = quantity_sold[:24]  # Quantity Sold
    page_number = ['1'] * 24  # Page Number
    Date = date.today()
    Date = [str(Date)] * 24   # Date
    data = [brands, model, specifications, selling_price, review,
            quantity_sold, page_number, Date]
    return data
The above is the function I defined. Open to suggestions.
You can try the following:
Redefine your data_collection method to accept the page source parsed by BeautifulSoup:
def data_collection(soup):
    title = soup.find_all(name="span", class_="a-size-base-plus a-color-base a-text-normal")
    all_specs = [specs.getText() for specs in title]
    brands = [items.split(' ', 1)[0] for items in all_specs]  # Brand
    phones = [text.split(')')[0].split('(') for text in all_specs]
    spec = []
    for i in phones:
        for j in i:
            spec.append(j)
    model = spec[::2]           # Model
    specifications = spec[1::2] # Specs
    s_price_obj = soup.find_all(name="span", class_="a-price-whole")
    selling_price = [price.getText() for price in s_price_obj]  # Price
    review_obj = soup.find_all(name="span", class_="a-icon-alt")
    review = [ratings.getText() for ratings in review_obj]
    review = review[:24]  # Ratings
    quantity_obj = soup.find_all(name="span", class_="a-size-base")
    quantity_sold = [items.getText() for items in quantity_obj]
    quantity_sold = quantity_sold[:24]  # Quantity Sold
    page_number = ['1'] * 24  # Page Number
    Date = date.today()
    Date = [str(Date)] * 24   # Date
    data = [brands, model, specifications, selling_price, review,
            quantity_sold, page_number, Date]
    return data
Then loop through each page, get the page source, parse it using BeautifulSoup, and pass it to the data_collection function. Example:
# from page (1..6)
for i in range(1, 7):
    # change page=i in the url to iterate through the pages
    url = f'https://www.amazon.in/s?k=mobile+phones&page={i}&qid=1632394501&ref=sr_pg_2'
    driver.get(url)
    # get current page source
    page_source = driver.page_source
    soup = BeautifulSoup(page_source, 'lxml')
    # call data_collection function
    data = data_collection(soup)
    # code to append data to csv
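One way to fill in that last comment (a sketch, assuming you want one CSV row per product and that the eight lists returned by data_collection line up; the output filename is arbitrary) is to transpose the lists into rows and append them with pandas inside the same loop, right where the "# code to append data to csv" comment sits:

import pandas as pd

columns = ['brand', 'model', 'specifications', 'selling_price',
           'review', 'quantity_sold', 'page_number', 'date']

# data is the list of eight parallel lists returned by data_collection();
# zip(*data) turns it into one tuple per product.
page_df = pd.DataFrame(list(zip(*data)), columns=columns)

# Write the header only on the first page, then append on later pages.
page_df.to_csv('amazon_mobiles.csv', mode='a', index=False, header=(i == 1))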

BeautifulSoup get links and info inside of them

I would like to scrape a website. The website has 10 complaint previews on each page. I wrote this script to get the links of the 10 complaints and some info from inside each link. When I run the script I get this error message: "RecursionError: maximum recursion depth exceeded".
Can someone tell me what the problem is? Thank you in advance!
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
# Create list objects for each information section
C_date = []
C_title = []
C_text = []
U_name = []
U_id = []
C_count = []
R_name = []
R_date = []
R_text = []
# Get 10 links for preview of complaints
def getLinks(url):
    response = get(url)
    html_soup = BeautifulSoup(response.text, 'html.parser')
    c_containers = html_soup.find_all('div', class_='media')
    # Store wanted links in a list
    allLinks = []
    for link in c_containers:
        find_tag = link.find('a')
        find_links = find_tag.get('href')
        full_link = "".join((url, find_links))
        allLinks.append(full_link)
    # Get total number of links
    print(len(allLinks))
    return allLinks
def GetData(Each_Link):
    each_complaint_page = get(Each_Link)
    html_soup = BeautifulSoup(each_complaint_page.text, 'html.parser')
    # Get date of complaint
    dt = html_soup.main.find('span')
    date = dt['title']
    C_date.append(date)
    # Get Title of complaint
    TL = html_soup.main.find('h1', {'class': 'title'})
    Title = TL.text
    C_title.append(Title)
    # Get main text of complaint
    Tx = html_soup.main.find('div', {'class': 'description'})
    Text = Tx.text
    C_text.append(Text)
    # Get user name and id
    Uname = html_soup.main.find('span', {'class': 'user'})
    User_name = Uname.span.text
    User_id = Uname.attrs['data-memberid']
    U_name.append(User_name)
    U_id.append(User_id)
    # Get view count of complaint
    Vcount = html_soup.main.find('span', {'view-count-detail'})
    View_count = Vcount.text
    C_count.append(View_count)
    # Get reply for complaint
    Rpnm = html_soup.main.find('h4', {'name'})
    Reply_name = Rpnm.next
    R_name.append(Reply_name)
    # Get reply date
    Rpdt = html_soup.main.find('span', {'date-tips'})
    Reply_date = Rpdt.attrs['title']
    R_date.append(Reply_date)
    # Get reply text
    Rptx = html_soup.main.find('p', {'comment-content-msg company-comment-msg'})
    Reply_text = Rptx.text
    R_text.append(Reply_text)

link_list = getLinks('https://www.sikayetvar.com/arcelik')

for i in link_list:
    z = GetData(i)
    print(z)
PS: My next step will be to put all the information in a data frame.
Your GetData() method calls itself, with no base case: this causes infinite recursion:
def GetData(data):
    for i in GetData(data):
You're also calling response = get(i) but then ignoring the result... perhaps you meant to say
def GetData(link):
    i = get(link)
    ...
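Separately, for the PS about putting everything in a data frame: once the loop has filled the lists, a minimal sketch (assuming all nine lists end up the same length) is to build the frame from a dict of lists:

import pandas as pd

complaints = pd.DataFrame({
    'date': C_date,
    'title': C_title,
    'text': C_text,
    'user_name': U_name,
    'user_id': U_id,
    'view_count': C_count,
    'reply_name': R_name,
    'reply_date': R_date,
    'reply_text': R_text,
})
print(complaints.head())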

Dealing With Ajax Request Python

Case:
I am trying to extract the number of pages of results from a site. I create a filter on the page with the code below:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import time
import urllib2

fp = webdriver.FirefoxProfile()
fp.set_preference("javascript.enabled", True)
b = webdriver.Firefox(firefox_profile=fp)
b.get(url)
time.sleep(10)
search = b.find_element_by_name("rb")
search.clear()
search.send_keys('dove')
search.send_keys(Keys.ESCAPE)
search.submit()
shampoo_sel = b.find_element_by_id('flt-46')
shampoo_sel.click()
conditioner_sel = b.find_element_by_id('flt-47')
conditioner_sel.click()
time.sleep(5)
search_url = b.current_url
dp = urllib2.urlopen(search_url).read()
dp_soup = BeautifulSoup(dp)
search_page_num = dp_soup.find("li", { "id" : "pagContinue" })
print search_page_num
When I try to download the page again using the current URL, the URL is the same before and after applying the filter, and hence I am unable to get the exact number of pages after filtering.
What should I do in this case?
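Since the filters are applied by JavaScript without changing the URL, re-downloading b.current_url with urllib2 always fetches the unfiltered page. A minimal sketch of the usual fix (assuming the filtered results are rendered in the live browser DOM) is to parse the browser's own page source instead of re-requesting the URL:

# After the clicks and the sleep, read the DOM the browser is actually
# showing, rather than downloading the URL again.
dp_soup = BeautifulSoup(b.page_source, 'html.parser')
search_page_num = dp_soup.find("li", {"id": "pagContinue"})
print(search_page_num)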
