Navigating a website in Python, scraping, and posting

There are many good resources on Stack Overflow already, but I'm still having an issue. I've visited these sources:
how to submit query to .aspx page in python
Submitting a post request to an aspx page
Scrapping aspx webpage with Python using BeautifulSoup
http://www.pythonforbeginners.com/cheatsheet/python-mechanize-cheat-sheet
I'm attempting to visit http://www.latax.state.la.us/Menu_ParishTaxRolls/TaxRolls.aspx and select a Parish. I believe this forces a POST and lets me select a year, which posts again and allows yet more selection. I've written my script a few different ways following the sources above, but I haven't been able to submit the form so that I can select a year.
My current code
import urllib
from bs4 import BeautifulSoup
import mechanize

headers = [
    ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'),
    ('Origin', 'http://www.latax.state.la.us'),
    ('User-Agent', 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.57 Safari/537.17'),
    ('Content-Type', 'application/x-www-form-urlencoded'),
    ('Referer', 'http://www.latax.state.la.us/Menu_ParishTaxRolls/TaxRolls.aspx'),
    ('Accept-Encoding', 'gzip,deflate,sdch'),
    ('Accept-Language', 'en-US,en;q=0.8'),
]

br = mechanize.Browser()
br.addheaders = headers

url = 'http://www.latax.state.la.us/Menu_ParishTaxRolls/TaxRolls.aspx'
response = br.open(url)

# first HTTP request without form data; read the page once so it can be
# parsed and also written to disk
html = response.read()
soup = BeautifulSoup(html)

# parse and retrieve the two vital ASP.NET hidden form values
viewstate = soup.findAll("input", {"type": "hidden", "name": "__VIEWSTATE"})
eventvalidation = soup.findAll("input", {"type": "hidden", "name": "__EVENTVALIDATION"})

# form data to carry over on the postback (not submitted yet)
formData = (
    ('__EVENTVALIDATION', eventvalidation[0]['value']),
    ('__VIEWSTATE', viewstate[0]['value']),
    ('__VIEWSTATEENCRYPTED', ''),
)

try:
    fout = open('C:\\GIS\\tmp.htm', 'w')
except IOError:
    print('Could not open output file\n')

fout.write(html)
fout.close()
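The piece I haven't been able to get right is the actual postback. For an ASP.NET page it would presumably look roughly like this (untested sketch; the dropdown name comes from the page source and the option value is a guess):
postData = formData + (
    ('__EVENTTARGET', 'ctl00$ContentPlaceHolderMain$ddParish'),
    ('__EVENTARGUMENT', ''),
    ('ctl00$ContentPlaceHolderMain$ddParish', 'TERREBONNE PARISH'),  # option value is a guess
)
# posting the hidden fields plus the dropdown selection back to the same URL
# is what triggers the server-side dropdown event
response = br.open(url, urllib.urlencode(postData))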
I've also attempted this in the shell; what I entered plus what I received (trimmed to cut down on the bulk) can be found at http://pastebin.com/KAW5VtXp
Any way I try to change the value in the Parish dropdown list and post, I get taken to a webmaster login page.
Am I approaching this the correct way? Any thoughts would be extremely helpful.
Thanks!

I ended up using Selenium:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

driver = webdriver.Firefox()
driver.get("http://www.latax.state.la.us/Menu_ParishTaxRolls/TaxRolls.aspx")

# select the Parish, which triggers the first postback
elem = driver.find_element_by_name("ctl00$ContentPlaceHolderMain$ddParish")
elem.send_keys("TERREBONNE PARISH")
elem.send_keys(Keys.RETURN)

# select the year (second postback)
elem = driver.find_element_by_name("ctl00$ContentPlaceHolderMain$ddYear")
elem.send_keys("2013")
elem.send_keys(Keys.RETURN)

# switch the search field via the second radio button
elem = driver.find_element_by_id("ctl00_ContentPlaceHolderMain_rbSearchField_1")
elem.click()

# enter the APN and search
APN = 'APN # here'
elem = driver.find_element_by_name("ctl00$ContentPlaceHolderMain$txtSearch")
elem.send_keys(APN)
elem.send_keys(Keys.RETURN)

# Access the PDF
elem = driver.find_element_by_link_text('Generate Report')
elem.click()

# follow the second link on the report page
elements = driver.find_elements_by_tag_name('a')
elements[1].click()

Related

Accessing href link using BeautifulSoup

I'm trying to scrape the href of the first link titled "BACC B ET A COMPTABILITE CONSEIL". However, I can't seem to extract the href when I'm using BeautifulSoup. Could you please recommend a solution?
Here's the URL: https://www.pappers.fr/recherche?q=B+%26+A+COMPTABILITE+CONSEIL&ville=94160
My code:
import requests
from bs4 import BeautifulSoup

url = 'https://www.pappers.fr/recherche?q=B+%26+A+COMPTABILITE+CONSEIL&ville=94160'
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36'}

resp = requests.get(url, headers=headers)
soup = BeautifulSoup(resp.content, 'html.parser')
a = soup.find('div', {'class': 'nom-entreprise'})
print(a)
Result:
None.
The link is constructed dynamically with JavaScript. All you need is a number (the SIREN), which can be obtained with an Ajax query:
import json
import requests

# url = "https://www.pappers.fr/recherche?q=B+%26+A+COMPTABILITE+CONSEIL&ville=94160"
api_url = "https://api.pappers.fr/v2/recherche"

payload = {
    "q": "B & A COMPTABILITE CONSEIL",  # <-- your search query
    "code_naf": "",
    "code_postal": "94160",  # <-- this is "ville" from the URL
    "api_token": "97a405f1664a83329a7d89ebf51dc227b90633c4ba4a2575",
    "precision": "standard",
    "bases": "entreprises,dirigeants,beneficiaires,documents,publications",
    "page": "1",
    "par_page": "20",
}

data = requests.get(api_url, params=payload).json()

# uncomment this to print all data (all details):
# print(json.dumps(data, indent=4))

print("https://www.pappers.fr/entreprise/" + data["resultats"][0]["siren"])
Prints:
https://www.pappers.fr/entreprise/378002208
Opening the link will automatically redirect to:
https://www.pappers.fr/entreprise/bacc-b-et-a-comptabilite-conseil-378002208
The website is loaded dynamically, so requests alone can't render it. We can use Selenium as an alternative to scrape the page.
Install it with: pip install selenium.
Download the correct ChromeDriver from here.
To find the link you can use the CSS selector a.gros-gros-nom:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

url = "https://www.pappers.fr/recherche?q=B+%26+A+COMPTABILITE+CONSEIL&ville=94160"

driver = webdriver.Chrome()
driver.get(url)

# Wait for the link to be visible on the page and save the element to `link`
link = WebDriverWait(driver, 20).until(
    EC.visibility_of_element_located((By.CSS_SELECTOR, "a.gros-gros-nom"))
)
print(link.get_attribute("href"))

driver.quit()
Output:
https://www.pappers.fr/entreprise/bacc-b-et-a-comptabilite-conseil-378002208

Python: login to a website with the correct password but it returns 404

I need to log in to this webpage but it fails. The error returned is 404, even though I have entered the correct username and password.
import requests
import urllib2
session_requests = requests.session()
login_data = dict(name='john', password='abcdefg')
session_requests.post('https://www.shareinvestor.com/user/login.html', data=login_data)
Is a token needed to log in to this webpage? If so, how do I provide it?
1) It returns 404 because they have not set a POST route for the main_url/login.html page.
2) The requests package is not a good fit for simulating browser actions. Use mechanize instead:
import mechanize

br = mechanize.Browser()
br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0')]  # this is my info :-)
br.set_handle_robots(False)
br.open('https://www.shareinvestor.com/user/login.html?use_https=1')
There are three forms on that page (just print br.forms()), and by default mechanize selects a form by its name attribute; the login form doesn't have one.
So select it by id (credit goes to that post):
br.select_form(predicate=lambda f: 'id' in f.attrs and f.attrs['id'] == 'sic_login_form_user_login')
# according to https://stackoverflow.com/a/2582592/4481312, forms can also be
# selected by their order, so br.select_form(nr=1) is OK and more readable
then fill the form
br.form['name']='foo'
br.form['password']='asdfags'
and finally submit it.
br.submit()
I don't have the mechanize package installed on my current machine, so I'm not able to test the output.
The answer is below. I would suggest using Selenium, as it can handle JavaScript and rendering, and it behaves more like a human than a robot that the website might reject.
import time
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

browser = webdriver.Firefox()
browser.get("https://www.shareinvestor.com/user/login.html?use_https=1")
time.sleep(10)

username = browser.find_element_by_id("sic_login_header_username")
password = browser.find_element_by_id("sic_login_header_password")
print "find id done"

username.send_keys("name")
password.send_keys("password")
print "log in done"

login_attempt = browser.find_element_by_xpath("//*[@type='submit']")
login_attempt.submit()

How to scrape a website where the page URL does not change but the "next" button adds data below on the same page

I have a URL:
http://www.goudengids.be/qn/business/advanced/where/Provincie%20Antwerpen/what/restaurant
On that page there is a "next results" button which loads another 20 data points below the first set, without updating the URL. I wrote a script to scrape this page in Python, but it only scrapes the first 22 data points even though the "next results" button has been clicked and about 40 entries are shown.
How can I scrape these kinds of websites that load content dynamically?
My script is:
import csv
import requests
from bs4 import BeautifulSoup

url = "http://www.goudengids.be/qn/business/advanced/where/Provincie%20Antwerpen/what/restaurant/"
r = requests.get(url)

soup = BeautifulSoup(r.content)
print (soup.prettify())

g_data2 = soup.find_all("a", {"class": "heading"})
for item in g_data2:
    try:
        name = item.text
        print name
    except IndexError:
        name = ''
        print "No Name found!"
If you want to solve it with requests, you need to mimic what the browser does when you click the "Load More" button: it sends an XHR request to the http://www.goudengids.be/q/ajax/business/results.json endpoint. Simulate that in your code while maintaining the web-scraping session. The XHR responses are JSON, so there is no need for BeautifulSoup here at all:
import requests

main_url = "http://www.goudengids.be/qn/business/advanced/where/Provincie%20Antwerpen/what/restaurant/"
xhr_url = "http://www.goudengids.be/q/ajax/business/results.json"

with requests.Session() as session:
    session.headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36'}

    # visit main URL to pick up the session cookies
    session.get(main_url)

    # load more listings - follow the pagination
    page = 1
    listings = []
    while True:
        params = {
            "input": "restaurant Provincie Antwerpen",
            "what": "restaurant",
            "where": "Provincie Antwerpen",
            "type": "DOUBLE",
            "resultlisttype": "A_AND_B",
            "page": str(page),
            "offset": "2",
            "excludelistingids": "nl_BE_YP_FREE_11336647_0000_1746702_6165_20130000, nl_BE_YP_PAID_11336647_0000_1746702_7575_20139729427, nl_BE_YP_PAID_720348_0000_187688_7575_20139392980",
            "context": "SRP * A_LIST"
        }
        # reuse the session so the cookies from the first visit are sent along
        response = session.get(xhr_url, params=params, headers={
            "X-Requested-With": "XMLHttpRequest",
            "Referer": main_url
        })
        data = response.json()

        # collect listing names in a list (for example purposes)
        listings.extend([item["bn"] for item in data["overallResult"]["searchResults"]])
        page += 1
        # TODO: figure out exit condition for the while True loop

    print(listings)
I've left an important TODO for you - figure out an exit condition - when to stop collecting listings.
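One plausible exit condition (an assumption about how the endpoint behaves, not something I have verified) is to stop as soon as a page comes back with an empty result list:
# inside the while True loop, right after data = response.json()
results = data["overallResult"]["searchResults"]
if not results:
    break  # an empty page means there is nothing more to load

listings.extend([item["bn"] for item in results])
page += 1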
Instead of focusing on scraping HTML I think you should look at the JSON that is retrieved via AJAX. I think the JSON is less likely to be changed in the future as opposed to the page's markup. And on top of that, it's way easier to traverse a JSON structure than it is to scrape a DOM.
For instance, when you load the page you provided it hits a url to get JSON at http://www.goudengids.be/q/ajax/business/results.json.
Then it provides some URL parameters to query the businesses. I think you should look into using this to get your data rather than scraping the page, simulating button clicks, etc.
Edit:
And it looks like it's using the headers set from visiting the site initially to ensure that you have a valid session. So you may have to hit the site initially to get the cookie headers and set that for subsequent requests to get the JSON from the endpoint above. I still think this will be easier and more predictable than trying to scrape HTML.
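A minimal sketch of that idea, assuming the same results.json endpoint as in the answer above (the exact query parameters may differ):
import requests

main_url = "http://www.goudengids.be/qn/business/advanced/where/Provincie%20Antwerpen/what/restaurant/"
json_url = "http://www.goudengids.be/q/ajax/business/results.json"

with requests.Session() as session:
    # hitting the site first stores whatever cookies it sets on the session
    session.get(main_url)

    # those cookies are then sent automatically with the JSON request
    response = session.get(json_url,
                           params={"what": "restaurant", "where": "Provincie Antwerpen", "page": "1"},
                           headers={"X-Requested-With": "XMLHttpRequest", "Referer": main_url})
    print(response.json())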

How to scrape aspx pages with python

I am trying to scrape a site, https://www.searchiqs.com/nybro/ (you have to click "Log In as Guest" to get to the search form). If I search for a Party 1 term like, say, "Andrew", the results have pagination, and the request type is POST, so the URL does not change. The sessions also time out very quickly: so quickly that if I wait ten minutes and refresh the search URL page it gives me a timeout error.
I only got started with scraping recently, so I have mostly been doing GET requests where I can decipher the URL. So far I have realized that I will have to look at the DOM. Using Chrome Tools, I have found the headers. From the Network tab, I have also found the following form data that is passed on from the search page to the results page:
__EVENTTARGET:
__EVENTARGUMENT:
__LASTFOCUS:
__VIEWSTATE:/wEPaA8FDzhkM2IyZjUwNzg...(i have truncated this for length)
__VIEWSTATEGENERATOR:F92D01D0
__EVENTVALIDATION:/wEdAJ8BsTLFDUkTVU3pxZz92BxwMddqUSAXqb... (i have truncated this for length)
BrowserWidth:1243
BrowserHeight:705
ctl00$ContentPlaceHolder1$scrollPos:0
ctl00$ContentPlaceHolder1$txtName:david
ctl00$ContentPlaceHolder1$chkIgnorePartyType:on
ctl00$ContentPlaceHolder1$txtFromDate:
ctl00$ContentPlaceHolder1$txtThruDate:
ctl00$ContentPlaceHolder1$cboDocGroup:(ALL)
ctl00$ContentPlaceHolder1$cboDocType:(ALL)
ctl00$ContentPlaceHolder1$cboTown:(ALL)
ctl00$ContentPlaceHolder1$txtPinNum:
ctl00$ContentPlaceHolder1$txtBook:
ctl00$ContentPlaceHolder1$txtPage:
ctl00$ContentPlaceHolder1$txtUDFNum:
ctl00$ContentPlaceHolder1$txtCaseNum:
ctl00$ContentPlaceHolder1$cmdSearch:Search
All the fields in caps are hidden. I have also managed to figure out the results structure.
My script thus far is really pathetic, as I am completely blank on what to do next. I still have to do the form submission, analyze the pagination, and scrape the results, but I have no idea how to proceed.
import re
import urlparse

import mechanize
from bs4 import BeautifulSoup


class DocumentFinderScraper(object):
    def __init__(self):
        self.url = "https://www.searchiqs.com/nybro/SearchResultsMP.aspx"
        self.br = mechanize.Browser()
        self.br.addheaders = [('User-agent',
                               'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.63 Safari/535.7')]

    ##TO DO
    ##submit form
    #get return URL
    #scrape results
    #analyze pagination

if __name__ == '__main__':
    scraper = DocumentFinderScraper()
    scraper.scrape()
Any help would be dearly appreciated
I disabled JavaScript and visited https://www.searchiqs.com/nybro/. With JavaScript off, the Log In and Log In as Guest buttons on the form are disabled. This makes it impossible for mechanize to work, because it cannot process JavaScript and you won't be able to submit the form.
For this kind of problem you can use Selenium, which simulates a full browser, with the disadvantage of being slower than mechanize.
This code should log you in using Selenium:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
usr = ""
pwd = ""
driver = webdriver.Firefox()
driver.get("https://www.searchiqs.com/nybro/")
assert "IQS" in driver.title
elem = driver.find_element_by_id("txtUserID")
elem.send_keys(usr)
elem = driver.find_element_by_id("txtPassword")
elem.send_keys(pwd)
elem.send_keys(Keys.RETURN)
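From there, a possible continuation (an untested sketch; the field names are taken from the form data listed in the question) would be to fill the search box and trigger the search:
# continue in the same driver session after logging in
elem = driver.find_element_by_name("ctl00$ContentPlaceHolder1$txtName")
elem.send_keys("david")

# clicking the search button performs the ASP.NET postback in the browser,
# so __VIEWSTATE / __EVENTVALIDATION are handled automatically; the results
# can then be scraped from driver.page_source
driver.find_element_by_name("ctl00$ContentPlaceHolder1$cmdSearch").click()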

How to get contents of frames automatically if browser does not support frames + can't access frame directly

I am trying to automatically download PDFs from URLs like this to make a library of UN resolutions.
If I use Beautiful Soup or mechanize to open that URL, I get "Your browser does not support frames", and I get the same thing if I use the "copy as cURL" feature in Chrome dev tools.
The standard advice for the "Your browser does not support frames" message when using mechanize or Beautiful Soup is to follow the source of each individual frame and load that frame. But if I do so, I get an error message that the page is not authorized.
How can I proceed? I guess I could try this in zombie or phantom but I would prefer to not use those tools as I am not that familiar with them.
Okay, this was an interesting task to do with requests and BeautifulSoup.
There is a set of underlying calls to un.org and daccess-ods.un.org that are important and set relevant cookies. This is why you need to maintain requests.Session() and visit several urls before getting access to the pdf.
Here's the complete code:
import re
from urlparse import urljoin

import requests
from bs4 import BeautifulSoup

BASE_URL = 'http://www.un.org/en/ga/search/'
URL = "http://www.un.org/en/ga/search/view_doc.asp?symbol=A/RES/68/278"
BASE_ACCESS_URL = 'http://daccess-ods.un.org'

# start session
session = requests.Session()
response = session.get(URL, headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36'})

# get frame links
soup = BeautifulSoup(response.text)
frames = soup.find_all('frame')
header_link, document_link = [urljoin(BASE_URL, frame.get('src')) for frame in frames]

# get header
session.get(header_link, headers={'Referer': URL})

# get document html url
response = session.get(document_link, headers={'Referer': URL})
soup = BeautifulSoup(response.text)

content = soup.find('meta', content=re.compile('URL='))['content']
document_html_link = re.search('URL=(.*)', content).group(1)
document_html_link = urljoin(BASE_ACCESS_URL, document_html_link)

# follow html link and get the pdf link
response = session.get(document_html_link)
soup = BeautifulSoup(response.text)

# get the real document link
content = soup.find('meta', content=re.compile('URL='))['content']
document_link = re.search('URL=(.*)', content).group(1)
document_link = urljoin(BASE_ACCESS_URL, document_link)

print document_link

# follow the frame link with login and password first - would set the important cookie
auth_link = soup.find('frame', {'name': 'footer'})['src']
session.get(auth_link)

# download file
with open('document.pdf', 'wb') as handle:
    response = session.get(document_link, stream=True)

    for block in response.iter_content(1024):
        if not block:
            break
        handle.write(block)
You should probably extract separate blocks of code into functions to make it more readable and reusable.
FYI, all of this could be done more easily through a real browser with the help of Selenium or Ghost.py.
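For example, a rough Selenium sketch (untested against the site) that lets the browser resolve the frameset for you:
from selenium import webdriver

driver = webdriver.Firefox()
driver.get("http://www.un.org/en/ga/search/view_doc.asp?symbol=A/RES/68/278")

# the browser loads the frameset and its cookies itself; list the frame
# sources and follow the document frame from here if needed
for frame in driver.find_elements_by_tag_name("frame"):
    print(frame.get_attribute("src"))

driver.quit()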
Hope that helps.
