Python mechanize javascript

I'm trying to use mechanize to grab prices for New York's metro-north railroad from this site:
http://as0.mta.info/mnr/fares/choosestation.cfm
The problem is that when you select the first option, the site uses javascript to populate your list of possible destinations. I have written equivalent code in python, but I can't seem to get it all working. Here's what I have so far:
import mechanize
import cookielib
from bs4 import BeautifulSoup
br = mechanize.Browser()
br.set_handle_robots(False)
br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
br.open("http://as0.mta.info/mnr/fares/choosestation.cfm")
br.select_form(name="form1")
br.form.set_all_readonly(False)
origin_control = br.form.find_control("orig_stat", type="select")
origin_control_list = origin_control.items
origin_control.value = [origin_control.items[0].name]
destination_control_list = reFillList(0, origin_control_list)
destination_control = br.form.find_control("dest_stat", type="select")
destination_control.items = destination_control_list
destination_control.value = [destination_control.items[0].name]
response = br.submit()
response_text = response.read()
print response_text
I know I didn't give you the code for the reFillList() method because it's long, but assume it correctly creates a list of mechanize.option objects. Python doesn't complain about anything, but on submit I get the HTML for this alert:
"Fare information for travel between two lines is not available on-line. Please contact our Customer Information Center at 511 and ask to speak to a representative for further information."
Am I missing something here? Thanks for all the help!

If you know the station IDs, it is easier to POST the request yourself:
import mechanize
import urllib
post_url = 'http://as0.mta.info/mnr/fares/get_fares.cfm'
orig = 295 #BEACON FALLS
dest = 292 #ANSONIA
params = urllib.urlencode({'dest_stat':dest, 'orig_stat':orig })
rq = mechanize.Request(post_url, params)
fares_page = mechanize.urlopen(rq)
print fares_page.read()
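The response here is just printed; if you want to pull values out of it, here is a minimal parsing sketch. The markup of get_fares.cfm isn't shown in the question, so dumping every table cell is only a placeholder until you've inspected the real page:
from bs4 import BeautifulSoup

# re-issue the request so the HTML ends up in a variable instead of being printed
html = mechanize.urlopen(mechanize.Request(post_url, params)).read()
soup = BeautifulSoup(html)
for cell in soup.find_all('td'):
    text = cell.get_text(strip=True)
    if text:
        print text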
If you have the code to find the list of destination IDs for a given starting ID (i.e. a variant of reFillList()), you can then run this request for each combination:
import mechanize
import urllib, urllib2
from bs4 import BeautifulSoup
url = 'http://as0.mta.info/mnr/fares/choosestation.cfm'
post_url = 'http://as0.mta.info/mnr/fares/get_fares.cfm'
def get_fares(orig, dest):
    params = urllib.urlencode({'dest_stat': dest, 'orig_stat': orig})
    rq = mechanize.Request(post_url, params)
    fares_page = mechanize.urlopen(rq)
    print(fares_page.read())

pool = BeautifulSoup(urllib2.urlopen(url).read())

# let's keep our stations organised
stations = {}  # dict by station id
for option in pool.find('select', {'name': 'orig_stat'}).findChildren():
    stations[option['value']] = {'name': option.string}

# iterate over all routes
for origin in stations:
    destinations = get_list_of_dests(origin)  # use your code for this
    stations[origin]['dests'] = destinations
    for destination in destinations:
        print('Processing from %s to %s' % (origin, destination))
        get_fares(origin, destination)
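If you'd rather keep the fare pages than print them, here is a small variation of get_fares (purely illustrative; the file-naming scheme is arbitrary) that saves each response to disk:
def save_fares(orig, dest):
    params = urllib.urlencode({'dest_stat': dest, 'orig_stat': orig})
    html = mechanize.urlopen(mechanize.Request(post_url, params)).read()
    # one file per origin/destination pair, named by the two station ids
    with open('fares_%s_%s.html' % (orig, dest), 'w') as f:
        f.write(html)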

Related

Scraping with Beautiful Soup does not update values properly

I'm trying to web-scrape a weather website, but the data does not update properly. The code:
from urllib.request import urlopen
from bs4 import BeautifulSoup
url = 'https://www.wunderground.com/dashboard/pws/KORPISTO1'
while True:
    soup = BeautifulSoup(urlopen(url), 'html.parser')
    data = soup.find("div", {"class": "weather__text"})
    print(data.text)
I am looking at 'WIND & WIND GUST' in the 'CURRENT CONDITIONS' section. It prints the first values correctly (for example 1.0 / 2.2 mph), but after that the values update very slowly (at times 5+ minutes pass), even though they change every 10-30 seconds on the website.
And when the values do update in Python, they are still different from the current values on the website.
You could try this alternate method: since the site actually retrieves the data from another URL, you can make that request directly and scrape the page itself only every hour or so to refresh the request URL.
from urllib.request import urlopen
from bs4 import BeautifulSoup
import json
from datetime import datetime, timedelta
#def getReqUrl...
reqUrl = getReqUrl()
prevTime, prevAt = '', datetime.now()
while True:
    ures = json.loads(urlopen(reqUrl).read())
    if 'observations' not in ures:
        # the request url has expired - scrape the page again for a fresh one
        reqUrl = getReqUrl()
        ures = json.loads(urlopen(reqUrl).read())

    # to see time since last update
    obvTime = ures['observations'][0]['obsTimeUtc']
    td = (datetime.now() - prevAt).seconds

    wSpeed = ures['observations'][0]['imperial']['windSpeed']
    wGust = ures['observations'][0]['imperial']['windGust']
    print('', end=f'\r[+{td}s -> {obvTime}]: {wGust} / {wSpeed} mph')

    if prevTime < obvTime:
        prevTime = obvTime
        prevAt = datetime.now()
        print('')
Even when making the request directly, the "observation time" in the retrieved data sometimes jumps around, which is why I'm only printing on a fresh line when obvTime increases - without that, it looks like this. (If that's what you prefer, you can just print normally without the '', end='\r...' format, and the second if block is no longer necessary either.)
The first if block refreshes reqUrl (because it expires after a while); that is when I actually scrape the wunderground site, because the URL is inside one of their script tags:
def getReqUrl():
    url = 'https://www.wunderground.com/dashboard/pws/KORPISTO1'
    soup = BeautifulSoup(urlopen(url), 'html.parser')
    appText = soup.select_one('#app-root-state').text
    nxtSt = json.loads(appText.replace('&q;', '"'))['wu-next-state-key']
    return [
        ns for ns in nxtSt.values()
        if 'observations' in ns['value'] and
        len(ns['value']['observations']) == 1
    ][0]['url'].replace('&a;', '&')
or, since I know how the URL starts, more simply:
def getReqUrl():
    url = 'https://www.wunderground.com/dashboard/pws/KORPISTO1'
    soup = BeautifulSoup(urlopen(url), 'html.parser')
    appText = soup.select_one('#app-root-state').text
    rUrl = 'https://api.weather.com/v2/pws/observations/current'
    rUrl = rUrl + appText.split(rUrl)[1].split('&q;')[0]
    return rUrl.replace('&a;', '&')
Try:
import requests
from bs4 import BeautifulSoup

url = 'https://www.wunderground.com/dashboard/pws/KORPISTO1'
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:76.0) Gecko/20100101 Firefox/76.0'}

session = requests.Session()
r = session.get(url, timeout=30, headers=headers)  # print(r.status_code)
soup = BeautifulSoup(r.content, 'html.parser')

# 'WIND & WIND GUST' in 'CURRENT CONDITIONS' section
wind_gust = [float(i.text) for i in
             soup.select_one('.weather__header:-soup-contains("WIND & GUST")')
                 .find_next('div', class_='weather__text')
                 .select('span.wu-value-to')]

print(wind_gust)
# [1.8, 2.2]

wind = wind_gust[0]
gust = wind_gust[1]

print(wind)
# 1.8
print(gust)
# 2.2
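If you want to poll this the way the original while-loop does, here is a minimal sketch that wraps the selector in a function and re-checks periodically; it reuses url, headers and session from the snippet above, the 30-second interval is an arbitrary choice, and whether the server-rendered page refreshes as often as the API feed is a separate question:
import time

def read_wind_and_gust(sess):
    r = sess.get(url, timeout=30, headers=headers)
    s = BeautifulSoup(r.content, 'html.parser')
    header = s.select_one('.weather__header:-soup-contains("WIND & GUST")')
    values = header.find_next('div', class_='weather__text').select('span.wu-value-to')
    return [float(i.text) for i in values]

while True:
    print(read_wind_and_gust(session))  # e.g. [1.8, 2.2]
    time.sleep(30)  # arbitrary polling interval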

Scraping a website with a particular format using Python

I am trying to use Python to scrape the US News Ranking for universities, and I'm struggling. I normally use Python "requests" and "BeautifulSoup".
The data is here:
https://www.usnews.com/education/best-global-universities/rankings
Using right-click and Inspect shows a bunch of links, and I don't even know which one to pick. I followed an example I found on the web, but it just gives me empty data:
import requests
import urllib.request
import time
from bs4 import BeautifulSoup
import pandas as pd
import math
from lxml.html import parse
from io import StringIO
url = 'https://www.usnews.com/education/best-global-universities/rankings'
urltmplt = 'https://www.usnews.com/education/best-global-universities/rankings?page=2'
css = '#resultsMain :nth-child(1)'
npage = 20
urlst = [url] + [urltmplt + str(r) for r in range(2,npage+1)]
def scrapevec(url, css):
    doc = parse(StringIO(url)).getroot()
    return([link.text_content() for link in doc.cssselect(css)])

usng = []
for u in urlst:
    print(u)
    ts = [re.sub("\n *", " ", t) for t in scrapevec(u, css) if t != ""]
This doesn't work: ts comes back as an empty list.
I'd really appreciate any help.
The MWE you posted is not working at all: urlst is never defined, so it cannot be used. I strongly suggest you look for basic scraping tutorials (with Python, Java, etc.): there are plenty of them, and they are generally a good starting point.
Below you can find a snippet that prints the universities' names listed on page 1 - you'll be able to extend the code to all 150 pages with a for loop.
import requests
from bs4 import BeautifulSoup
newheaders = {
'User-Agent': 'Mozilla/5.0 (X11; Linux i686 on x86_64)'
}
baseurl = 'https://www.usnews.com/education/best-global-universities/rankings'
page1 = requests.get(baseurl, headers = newheaders) # change headers or get blocked
soup = BeautifulSoup(page1.text, 'lxml')
res_tab = soup.find('div', {'id' : 'resultsMain'}) # find the results' table
for a, univ in enumerate(res_tab.findAll('a', href=True)):  # parse universities' names
    if a < 10:  # there are 10 listed universities per page
        print(univ.text)
Edit: now the example works, but as you say in your question, it only returns empty lists. Below is an edited version of the code that returns a list of all universities (pp. 1-150):
import requests
from bs4 import BeautifulSoup
def parse_univ(url):
    newheaders = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux i686 on x86_64)'
    }
    page1 = requests.get(url, headers=newheaders)  # change headers or get blocked
    soup = BeautifulSoup(page1.text, 'lxml')
    res_tab = soup.find('div', {'id': 'resultsMain'})  # find the results' table
    res = []
    for a, univ in enumerate(res_tab.findAll('a', href=True)):  # parse universities' names
        if a < 10:  # there are 10 listed universities per page
            res.append(univ.text)
    return res
baseurl = 'https://www.usnews.com/education/best-global-universities/rankings?page='
ll = [parse_univ(baseurl + str(p)) for p in range(1, 151)] # this is a list of lists
univs = [item for sublist in ll for item in sublist] # unfold the list of lists
Re-edit following QHarr's suggestion (thanks!) - same output, but a shorter and more "pythonic" solution:
import requests
from bs4 import BeautifulSoup
def parse_univ(url):
    newheaders = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux i686 on x86_64)'
    }
    page1 = requests.get(url, headers=newheaders)  # change headers or get blocked
    soup = BeautifulSoup(page1.text, 'lxml')
    res_tab = soup.find('div', {'id': 'resultsMain'})  # find the results' table
    return [univ.text for univ in res_tab.select('[href]', limit=10)]
baseurl = 'https://www.usnews.com/education/best-global-universities/rankings?page='
ll = [parse_univ(baseurl + str(p)) for p in range(1, 151)] # this is a list of lists
univs = [item for sublist in ll for item in sublist]
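If you want the names somewhere more permanent than a Python list, here is a small follow-up sketch (pandas is already imported in the question; the file name is arbitrary):
import pandas as pd

# univs is the flat list built above; one row per scraped name
df = pd.DataFrame(univs, columns=['university'])
df.to_csv('usnews_global_rankings.csv', index=False)
print(df.head())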

Get data from next page after providing Search Parameters using Mechanize and BeautifulSoup

I want to crawl data from
http://dnre-mrne.gnb.ca/MineralOccurrence/default.aspx
From a browser, it's possible to get the paginated tabular results by entering "%" as the reference number.
Following is my code snippet:
import re
import mechanize
from bs4 import BeautifulSoup
class MineralDBScraper(object):
    def __init__(self):
        self.url = "http://dnre-mrne.gnb.ca/MineralOccurrence/default.aspx"
        self.br = mechanize.Browser()
        self.br.addheaders = [('User-agent',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.63 Safari/535.7')]
        self.br.set_handle_redirect(True)
        self.br.set_handle_robots(False)

    def select_form(self, form):
        return form.attrs.get('id', None) == 'MainForm'

    def scrape_state_firms(self, state_item):
        self.br.open(self.url)
        s = BeautifulSoup(self.br.response().read())
        saved_form = s.find('form', id='MainForm').prettify()
        self.br.select_form(predicate=self.select_form)
        self.br.form['ctl00$txtURN'] = state_item
        self.br.form.fixup()
        ctl = self.br.form.find_control('ctl00$reset1')
        self.br.form.controls.remove(ctl)
        self.br.submit()
        print self.br.response().read()

    def scrape(self):
        print 'Scraping all reference numbers by %'
        self.scrape_state_firms('%')


if __name__ == '__main__':
    scraper = MineralDBScraper()
    scraper.scrape()
After I run this code, my expectation is to get the tabular data set matching the search parameter '%'.
But I am getting the previous data, i.e. the landing page where the search parameters are entered.
Please help me out. Am I missing something here?
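A hedged sketch of one thing worth checking: ASP.NET forms usually expose several submit-type controls, and self.br.submit() with no arguments clicks the first one, which may not be the search button. Listing the form's controls and clicking a specific one sometimes brings back the results page. Below is a drop-in variant of scrape_state_firms; the 'ctl00$btnSearch' name is purely a guess, so read the real name off the printed list first.
    def scrape_state_firms(self, state_item):
        self.br.open(self.url)
        self.br.select_form(predicate=self.select_form)
        self.br.form['ctl00$txtURN'] = state_item

        # see which submit/image controls the form actually exposes
        for control in self.br.form.controls:
            if control.type in ('submit', 'image'):
                print control.name, control.type

        # click a named submit control instead of the first one;
        # 'ctl00$btnSearch' is a hypothetical name - replace it with one from the list above
        response = self.br.submit(name='ctl00$btnSearch')
        print response.read()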

Trouble extracting data from html-doc with BeautifulSoup

I'm trying to extract data from a page I scraped off the web, and I find it to be quite difficult. I tried soup.getText(), but it's no good since it just returns single chars in a row instead of whole string objects.
Extracting the name is easy, because you can access it with the 'b' tag, but extracting, for example, the street ("Am Vogelwäldchen 2") proves to be quite difficult. I could try to assemble the address from single chars, but this seems overly complicated, and I feel there has to be an easier way of doing this. Maybe someone has a better idea. Oh, and don't mind the weird function; I returned the soup because I tried different methods on it.
import urllib.request
import time
from bs4 import BeautifulSoup
#Performs a HTTP-'POST' request, passes it to BeautifulSoup and returns the result
def doRequest(request):
    requestResult = urllib.request.urlopen(request)
    soup = BeautifulSoup(requestResult)
    return soup

def getContactInfoFromPage(page):
    name = ''
    straße = ''
    plz = ''
    stadt = ''
    telefon = ''
    mail = ''
    url = ''
    data = [
        #'Name',
        #'Straße',
        #'PLZ',
        #'Stadt',
        #'Telefon',
        #'E-Mail',
        #'Homepage'
    ]
    request = urllib.request.Request("http://www.altenheim-adressen.de/schnellsuche/" + page)
    request.add_header("Content-Type", "application/x-www-form-urlencoded;charset=utf-8")
    request.add_header("User-Agent", "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:33.0) Gecko/20100101 Firefox/33.0")
    soup = doRequest(request)

    # Save Name to data structure
    findeName = soup.findAll('b')
    name = findeName[2]
    name = name.string.split('>')
    data.append(name)

    return soup
soup = getContactInfoFromPage("suche2.cfm?id=267a0749e983c7edfeef43ef8e1c7422")
print(soup.getText())
You can rely on the field label and get the next sibling's text.
Making a nice reusable function from this would make it more transparent and easy to use:
def get_field_value(soup, field):
    field_label = soup.find('td', text=field + ':')
    return field_label.find_next_sibling('td').get_text(strip=True)
Usage:
print(get_field_value(soup, 'Name')) # prints 'AWO-Seniorenzentrum Kenten'
print(get_field_value(soup, 'Land')) # prints 'Deutschland'
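To fill the data structure from the question, you can loop over the labels. A minimal sketch, assuming each of these labels (taken from the commented-out field names in the question) appears on the page spelled exactly like this:
fields = ['Name', 'Straße', 'PLZ', 'Stadt', 'Telefon', 'E-Mail', 'Homepage']
# build a dict of label -> value using the helper above
contact = {field: get_field_value(soup, field) for field in fields}
print(contact)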

Facebook Python login script not using API

OK, so I found a basic script to log in to Facebook using Python a while back. It didn't work - but after some tweaking (mainly around updating the POST strings) it worked well for quite a while. Now it's stopped again - I suspect because Facebook has changed their site a little.
I've tried making further tweaks after capturing a login in Firefox and making sure I mimic as many of the POST values as possible.
I need to log in to the site directly, as I have a bunch of scripts that collect data that's available through a browser but not through the API.
Having spent days trying to fix this, I'm still drawing a blank... what am I missing?
import sys
import re
import urllib
import urllib2
import cookielib
import json
def main():
    # Check the arguments
    user = sys.argv[1]
    passw = sys.argv[2]

    # Initialize the needed modules
    CHandler = urllib2.HTTPCookieProcessor(cookielib.CookieJar())
    browser = urllib2.build_opener(CHandler)
    browser.addheaders = [('Referer', 'http://login.facebook.com'),
                          ('Content-Type', 'application/x-www-form-urlencoded'),
                          ('User-Agent', 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.1.7) Gecko/20091221 Firefox/3.5.7 (.NET CLR 3.5.30729)')]
    urllib2.install_opener(browser)

    res = browser.open('http://m.facebook.com/index.php')
    pg = res.read()
    mxt = re.search('name="li" value="(\w+)"', pg)
    mxt2 = re.search('name="m_ts" value="(\w+)"', pg)
    mxt3 = re.search('name="lsd" value="(\w+)"', pg)
    mxt4 = re.search('name="locale" value="(\w+)"', pg)
    li = mxt.group(1)
    m_ts = mxt2.group(1)
    lsd = mxt3.group(1)
    locale = mxt4.group(1)
    res.close()

    # Initialize the POST data
    data = urllib.urlencode({
        'lsd' : lsd,
        'charset_test' : urllib.unquote_plus('%E2%82%AC%2C%C2%B4%2C%E2%82%AC%2C%C2%B4%2C%E6%B0%B4%2C%D0%94%2C%D0%84'),
        'version' : '1',
        'm_ts' : m_ts,
        'li' : li,
        'locale' : locale,
        'signup_layout' : 'header_button',
        'laststage' : 'first',
        'post_form_id' : pfi,
        'email' : user,
        'pass' : passw,
        'login' : 'Log in'
    })

    url = 'https://login.facebook.com/login.php?login_attempt=1&non_com_login=&' + data
    res = urllib2.urlopen(url)
    print ('%s' % url)
    res.close()

    # Get Access Token
    res = browser.open('http://developers.facebook.com/docs/reference/api')
    conft = res.read()

    # For Debugging
    fh = open('debug.html', 'w')
    fh.write(conft)
    fh.close

    mat = re.search('access_token=(.*?)"', conft)
    acct = mat.group(1)
    print ('Using access token: %s' % acct)
For the record, here is the working answer for the above.
#!/usr/bin/python
import mechanize
browser = mechanize.Browser()
browser.set_handle_robots(False)
cookies = mechanize.CookieJar()
browser.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US) AppleWebKit/534.7 (KHTML, like Gecko) Chrome/7.0.517.41 Safari/534.7')]
browser.open("http://m.facebook.com/")
browser.select_form(nr=0)
browser.form['email'] = 'YOUR_LOGIN'
browser.form['pass'] = 'YOUR_PASSWORD'
response = browser.submit()
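If you want to sanity-check that the login actually worked, here is a small hedged follow-up; looking for a logout link is just an assumption about what the mobile page contains after a successful login:
html = response.read()
# crude check: the logged-in mobile page normally links to a logout action
if 'logout' in html.lower():
    print 'Login appears to have succeeded'
else:
    print 'Login may have failed - inspect the returned HTML'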
