I've created a script in Python using the requests module to fetch some information that is displayed after filling in a form with this email: africk2#nd.edu. The problem is that when I hit the Search button, I can see a new tab containing all the information I wish to grab, yet I don't see any corresponding request in the All tab under the Network section of Chrome dev tools. So I'm at a loss as to how I can get the information using the requests module.
website address
Steps to populate the result manually:
Enter this email address, africk2#nd.edu, in the Email Address input box and hit the Search button.
I've tried with:
import requests
from bs4 import BeautifulSoup
url = "https://eds.nd.edu/search/index.shtml"
post_url = "https://eds.nd.edu/cgi-bin/nd_ldap_search.pl"
res = requests.get(url,headers={"User-Agent":"Mozilla/5.0"})
soup = BeautifulSoup(res.text,"lxml")
payload = {item['name']:item.get('value','') for item in soup.select('input[name]')}
payload['email'] = 'africk2#nd.edu'
del payload['clear']
resp = requests.post(post_url,data=payload)
print(resp.content)
The above script is a faulty approach; however, I can't come up with any way to grab the information connected to that email.
P.S. I'm not after a selenium-oriented solution.
Ok, solved it:
from urllib.parse import quote

import requests


def get_contact_html(email: str):
    encoded = quote('o="University of Notre Dame", '
                    'st=Indiana, '
                    'c=US?displayName,edupersonaffiliation,ndTitle,ndDepartment,postalAddress,telephoneNumber,mail,searchGuide,labeledURI,'
                    'uid?'
                    'sub?'
                    f'(&(ndMail=*{email}*))')
    data = {
        "ldapurl": f'LDAP://directory.nd.edu:389/{encoded}',
        "ldaphost": "directory.nd.edu",
        "ldapport": '389',
        "ldapbase": 'o="University of Notre Dame", st=Indiana, c=US',
        "ldapfilter": f'(&(ndMail=*{email}*))',
        "ldapheadattr": "displayname",
        "displayformat": "nd",
        "ldapmask": "",
        "ldapscope": "",
        "ldapsort": "",
        "ldapmailattr": "",
        "ldapurlattr": "",
        "ldapaltattr": "",
        "ldapjpgattr": "",
        "ldapdnattr": "",
    }
    res = requests.post('https://eds.nd.edu/cgi-bin/nd_ldap_search.pl',
                        data=data)
    res.raise_for_status()
    return res.text


if __name__ == '__main__':
    html = get_contact_html('africk2#nd.edu')
    print(html)
output:
...
Formal Name:
...
Aaron D Frick
...
This will give you the HTML for the page.
The trick was converting the encoded spaces (+) back to real spaces in the "ldapbase" field ('o="University of Notre Dame", st=Indiana, c=US') and letting the requests module encode the value itself; otherwise the + signs get double-encoded.
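A minimal sketch of that encoding pitfall, using only the standard library (no request is sent; requests form-encodes a data dict in essentially the same way urlencode does):

from urllib.parse import quote_plus, urlencode

base = 'o="University of Notre Dame", st=Indiana, c=US'

# Let the form encoder handle the value once: spaces become '+'.
print(urlencode({"ldapbase": base}))
# ldapbase=o%3D%22University+of+Notre+Dame%22%2C+st%3DIndiana%2C+c%3DUS

# Pre-encoding the value yourself and then form-encoding it again
# turns every '+' into '%2B' and every '%' into '%25' (double encoding).
pre_encoded = quote_plus(base)
print(urlencode({"ldapbase": pre_encoded}))
# ldapbase=o%253D%2522University%2Bof%2BNotre%2BDame%2522%252C%2Bst%253DIndiana%252C%2Bc%253DUS

So passing an already-encoded string as the form value gets it encoded a second time, which is why the plain, unencoded ldapbase value is the one that works.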
I'm trying to use pagination to request multiple pages of rental listings from Zillow; otherwise I'm limited to the first page only. However, my code seems to load the first page even when I specify other pages manually.
# Rent
import requests
from bs4 import BeautifulSoup as soup
import json
url = 'https://www.zillow.com/torrance-ca/rentals'
params = {
    'q': {"pagination":{"currentPage": 1},"isMapVisible":False,"filterState":{"fore":{"value":False},"mf":{"value":False},"ah":{"value":True},"auc":{"value":False},"nc":{"value":False},"fr":{"value":True},"land":{"value":False},"manu":{"value":False},"fsbo":{"value":False},"cmsn":{"value":False},"fsba":{"value":False}},"isListVisible":True}
}
headers = {
    # headers were copied from network tab on developer tools in chrome
}
html = requests.get(url=url, headers=headers, params=params)
html.status_code
bsobj = soup(html.content, 'lxml')

for script in bsobj.find_all('script'):
    inner_text_with_string = str(script.string)
    if inner_text_with_string[:18] == '<!--{"queryState":':
        my_query = inner_text_with_string

my_query = my_query.strip('><!-')
data = json.loads(my_query)
data = data['cat1']['searchResults']['listResults']
print(data)
This returns about 40 listings. However, if I change "pagination":{"currentPage": 1} to "pagination":{"currentPage": 2}, it returns the same listings! It's as if the pagination parameter isn't recognized.
I believe these are the correct parameters, as I took them straight from the URL query string and used http://urlprettyprint.com/ to make it readable.
Any thoughts on what I'm doing wrong?
Using the params argument with requests is sending the wrong data; you can confirm this by printing response.url. What I would do is use urllib.parse.urlencode:
from urllib.parse import urlencode
...
html = requests.get(url=f"{url}?{urlencode(params)}", headers=headers)
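As a hedged sketch of that idea: since the value of 'q' is a nested Python dict, it likely also needs to be serialized with json.dumps before encoding, otherwise urlencode will stringify it with Python-style quotes and booleans that the site won't parse. The 'q' parameter name is taken from the question; the trimmed filterState is illustrative, not the full filter set:

import json
from urllib.parse import urlencode

import requests

url = 'https://www.zillow.com/torrance-ca/rentals'
query_state = {
    "pagination": {"currentPage": 2},
    "isMapVisible": False,
    "filterState": {"fr": {"value": True}},
    "isListVisible": True,
}
# Serialize the nested dict to valid JSON first, then build the query string.
params = {"q": json.dumps(query_state)}
headers = {"User-Agent": "Mozilla/5.0"}  # plus the headers copied from dev tools

html = requests.get(f"{url}?{urlencode(params)}", headers=headers)
print(html.url)  # confirm the query string carries the page number you asked for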
Basically, I'm trying to remove all the characters after the URL extension in a URL, but it's proving difficult. The application works off a list of various URLs with various extensions.
Here's my source:
import requests
from bs4 import BeautifulSoup
from time import sleep
#takes userinput for path of panels they want tested
import_file_path = input('Enter the path of the websites to be tested: ')
#takes userinput for path of exported file
export_file_path = input('Enter the path of where we should export the panels to: ')
#reads imported panels
with open(import_file_path, 'r') as panels:
    panel_list = []
    for line in panels:
        panel_list.append(line)

x = 0
for panel in panel_list:
    url = requests.get(panel)
    soup = BeautifulSoup(url.content, "html.parser")
    forms = soup.find_all("form")
    action = soup.find('form').get('action')
    values = {
        soup.find_all("input")[0].get("name"): "user",
        soup.find_all("input")[1].get("name"): "pass"
    }
    print(values)
    r = requests.post(action, data=values)
    print(r.headers)
    print(r.status_code)
    print(action)
    sleep(10)
    x += 1
What I'm trying to achieve is an application that automatically tests a username/password against a list of URLs provided in a text document. However, BeautifulSoup returns an incomplete URL when crawling for the form's action attribute, i.e. instead of returning the full http://example.com/action.php it returns action.php, exactly as it appears in the code. The only way I can think of to get past this would be to restate the 'action' variable as 'panel' (with all characters after the URL extension removed) followed by 'action'.
Thanks!
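A hedged aside on the relative-action problem described above: rather than trimming characters off the URL by hand, urllib.parse.urljoin can resolve the scraped action against the page it came from. The example.com values below are placeholders, not from the original post:

from urllib.parse import urljoin

panel = "http://example.com/admin/login.php"   # the page the form was scraped from
action = "action.php"                          # relative value from the form's action attribute

# urljoin resolves the relative action against the page URL,
# giving http://example.com/admin/action.php in this case.
full_action = urljoin(panel, action)
print(full_action)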
For the last few days I have been trying to scrape the following website (link pasted below), which has a few Excel files and PDFs available in a table. I am able to do it for the home page successfully. There are 59 pages in total from which these Excel files/PDFs have to be scraped. On most of the websites I have seen so far, there is a query parameter in the site URL that changes as you move from one page to another. In this case there is a _doPostBack function, probably because of which the URL remains the same on every page you go to. I have looked at multiple solutions and posts suggesting that I inspect the parameters of the POST call and use them, but I am not able to make sense of the parameters provided in the POST call (this is the first time I am scraping a website).
Can someone please suggest a resource that can help me write code to move from one page to another using Python? The details are as follows:
Website link - http://accord.fairfactories.org/ffcweb/Web/ManageSuppliers/InspectionReportsEnglish.aspx
My current code, which extracts the CAP Excel sheet from the home page (this works perfectly and is provided just for reference):
from urllib.request import urlopen
from urllib.request import urlretrieve
from bs4 import BeautifulSoup
import re
import urllib
Base = "http://accord.fairfactories.org/ffcweb/Web"
html = urlopen("http://accord.fairfactories.org/ffcweb/Web/ManageSuppliers/InspectionReportsEnglish.aspx")
bs = BeautifulSoup(html)
name = bs.findAll("td", {"class":"column_style_right column_style_left"})
i = 1
for link in bs.findAll("a", {"id":re.compile("CAP(?!\w)")}):
    if 'href' in link.attrs:
        name = str(i)+".xlsx"
        a = link.attrs['href']
        b = a.strip("..")
        c = Base+b
        urlretrieve(c, name)
        i = i+1
Please let me know if I have missed anything while providing the information, and please don't downvote me, else I won't be able to ask any further questions.
For aspx sites, you need to look for things like __EVENTTARGET, __EVENTVALIDATION etc. and post those parameters with each request. This will get all the pages, using requests with bs4:
import requests
from bs4 import BeautifulSoup
from urlparse import urljoin  # python 3 use from urllib.parse import urljoin

# All the keys need values set bar __EVENTTARGET, that stays the same.
data = {
    "__EVENTTARGET": "gvFlex",
    "__VIEWSTATE": "",
    "__VIEWSTATEGENERATOR": "",
    "__VIEWSTATEENCRYPTED": "",
    "__EVENTVALIDATION": ""}


def validate(soup, data):
    for k in data:
        # update post values in data.
        if k != "__EVENTTARGET":
            data[k] = soup.select_one("#{}".format(k))["value"]


def get_all_excel():
    base = "http://accord.fairfactories.org/ffcweb/Web"
    url = "http://accord.fairfactories.org/ffcweb/Web/ManageSuppliers/InspectionReportsEnglish.aspx"
    with requests.Session() as s:
        # Add a user agent for each subsequent request.
        s.headers.update({"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:47.0) Gecko/20100101 Firefox/47.0"})
        r = s.get(url)
        bs = BeautifulSoup(r.content, "lxml")
        # get links from initial page.
        for xcl in bs.select("a[id*=CAP]"):
            yield urljoin(base, xcl["href"])
        # need to re-validate the post data in our dict for each request.
        validate(bs, data)
        last = bs.select_one("a[href*=Page$Last]")
        i = 2
        # keep going until the last page button is not visible
        while last:
            # Increase the counter to set the target to the next page
            data["__EVENTARGUMENT"] = "Page${}".format(i)
            r = s.post(url, data=data)
            bs = BeautifulSoup(r.content, "lxml")
            for xcl in bs.select("a[id*=CAP]"):
                yield urljoin(base, xcl["href"])
            last = bs.select_one("a[href*=Page$Last]")
            # again re-validate for next request
            validate(bs, data)
            i += 1


for x in get_all_excel():
    print(x)
If we run it on the first three pages, you can see we get the data you want:
http://accord.fairfactories.org/Utilities/DownloadFile.aspx?id=9965
http://accord.fairfactories.org/Utilities/DownloadFile.aspx?id=9552
http://accord.fairfactories.org/Utilities/DownloadFile.aspx?id=10650
http://accord.fairfactories.org/Utilities/DownloadFile.aspx?id=11969
http://accord.fairfactories.org/Utilities/DownloadFile.aspx?id=10086
http://accord.fairfactories.org/Utilities/DownloadFile.aspx?id=10905
http://accord.fairfactories.org/Utilities/DownloadFile.aspx?id=10840
http://accord.fairfactories.org/Utilities/DownloadFile.aspx?id=9229
http://accord.fairfactories.org/Utilities/DownloadFile.aspx?id=11310
http://accord.fairfactories.org/Utilities/DownloadFile.aspx?id=9178
http://accord.fairfactories.org/Utilities/DownloadFile.aspx?id=9614
http://accord.fairfactories.org/Utilities/DownloadFile.aspx?id=9734
http://accord.fairfactories.org/Utilities/DownloadFile.aspx?id=10063
http://accord.fairfactories.org/Utilities/DownloadFile.aspx?id=10871
http://accord.fairfactories.org/Utilities/DownloadFile.aspx?id=9468
http://accord.fairfactories.org/Utilities/DownloadFile.aspx?id=9799
http://accord.fairfactories.org/Utilities/DownloadFile.aspx?id=9278
http://accord.fairfactories.org/Utilities/DownloadFile.aspx?id=12252
http://accord.fairfactories.org/Utilities/DownloadFile.aspx?id=9342
http://accord.fairfactories.org/Utilities/DownloadFile.aspx?id=9966
http://accord.fairfactories.org/Utilities/DownloadFile.aspx?id=11595
http://accord.fairfactories.org/Utilities/DownloadFile.aspx?id=9652
http://accord.fairfactories.org/Utilities/DownloadFile.aspx?id=10271
http://accord.fairfactories.org/Utilities/DownloadFile.aspx?id=10365
http://accord.fairfactories.org/Utilities/DownloadFile.aspx?id=10087
http://accord.fairfactories.org/Utilities/DownloadFile.aspx?id=9967
http://accord.fairfactories.org/Utilities/DownloadFile.aspx?id=11740
http://accord.fairfactories.org/Utilities/DownloadFile.aspx?id=12375
http://accord.fairfactories.org/Utilities/DownloadFile.aspx?id=11643
http://accord.fairfactories.org/Utilities/DownloadFile.aspx?id=10952
http://accord.fairfactories.org/Utilities/DownloadFile.aspx?id=12013
http://accord.fairfactories.org/Utilities/DownloadFile.aspx?id=9810
http://accord.fairfactories.org/Utilities/DownloadFile.aspx?id=10953
http://accord.fairfactories.org/Utilities/DownloadFile.aspx?id=10038
http://accord.fairfactories.org/Utilities/DownloadFile.aspx?id=9664
http://accord.fairfactories.org/Utilities/DownloadFile.aspx?id=12256
http://accord.fairfactories.org/Utilities/DownloadFile.aspx?id=9262
http://accord.fairfactories.org/Utilities/DownloadFile.aspx?id=9210
http://accord.fairfactories.org/Utilities/DownloadFile.aspx?id=9968
http://accord.fairfactories.org/Utilities/DownloadFile.aspx?id=9811
http://accord.fairfactories.org/Utilities/DownloadFile.aspx?id=11610
http://accord.fairfactories.org/Utilities/DownloadFile.aspx?id=9455
http://accord.fairfactories.org/Utilities/DownloadFile.aspx?id=11899
http://accord.fairfactories.org/Utilities/DownloadFile.aspx?id=10273
http://accord.fairfactories.org/Utilities/DownloadFile.aspx?id=9766
http://accord.fairfactories.org/Utilities/DownloadFile.aspx?id=9969
http://accord.fairfactories.org/Utilities/DownloadFile.aspx?id=10088
http://accord.fairfactories.org/Utilities/DownloadFile.aspx?id=10366
http://accord.fairfactories.org/Utilities/DownloadFile.aspx?id=9393
http://accord.fairfactories.org/Utilities/DownloadFile.aspx?id=9813
http://accord.fairfactories.org/Utilities/DownloadFile.aspx?id=11795
http://accord.fairfactories.org/Utilities/DownloadFile.aspx?id=9814
http://accord.fairfactories.org/Utilities/DownloadFile.aspx?id=11273
http://accord.fairfactories.org/Utilities/DownloadFile.aspx?id=12187
http://accord.fairfactories.org/Utilities/DownloadFile.aspx?id=10954
http://accord.fairfactories.org/Utilities/DownloadFile.aspx?id=9556
http://accord.fairfactories.org/Utilities/DownloadFile.aspx?id=11709
http://accord.fairfactories.org/Utilities/DownloadFile.aspx?id=9676
http://accord.fairfactories.org/Utilities/DownloadFile.aspx?id=10251
http://accord.fairfactories.org/Utilities/DownloadFile.aspx?id=10602
http://accord.fairfactories.org/Utilities/DownloadFile.aspx?id=10089
http://accord.fairfactories.org/Utilities/DownloadFile.aspx?id=9908
http://accord.fairfactories.org/Utilities/DownloadFile.aspx?id=10358
http://accord.fairfactories.org/Utilities/DownloadFile.aspx?id=9469
http://accord.fairfactories.org/Utilities/DownloadFile.aspx?id=11333
http://accord.fairfactories.org/Utilities/DownloadFile.aspx?id=9238
http://accord.fairfactories.org/Utilities/DownloadFile.aspx?id=9816
http://accord.fairfactories.org/Utilities/DownloadFile.aspx?id=9817
http://accord.fairfactories.org/Utilities/DownloadFile.aspx?id=10736
http://accord.fairfactories.org/Utilities/DownloadFile.aspx?id=10622
http://accord.fairfactories.org/Utilities/DownloadFile.aspx?id=9394
http://accord.fairfactories.org/Utilities/DownloadFile.aspx?id=9818
http://accord.fairfactories.org/Utilities/DownloadFile.aspx?id=10592
http://accord.fairfactories.org/Utilities/DownloadFile.aspx?id=9395
http://accord.fairfactories.org/Utilities/DownloadFile.aspx?id=11271
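As a follow-up, a hedged sketch for actually saving each linked workbook rather than just printing the URL (the numbered file names are my own choice; the generator above is reused):

for n, link in enumerate(get_all_excel(), start=1):
    resp = requests.get(link)
    resp.raise_for_status()
    with open("cap_{}.xlsx".format(n), "wb") as f:
        f.write(resp.content)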
I'm having a hard time figuring out the correct path in my web scraping code.
I am trying to scrape different info from http://financials.morningstar.com/company-profile/c.action?t=AAPL.
I have tried several paths, and some seem to work and some not.
I am interested in the CIK under Operation Details.
import requests
from lxml import html

page = requests.get('http://financials.morningstar.com/company-profile/c.action?t=AAPL')
tree = html.fromstring(page.text)

#desc = tree.xpath('//div[@class="r_title"]/span[@class="gry"]/text()') #works
#desc = tree.xpath('//div[@class="wrapper"]//div[@class="headerwrap"]//div[@class="h_Logo"]//div[@class="h_Logo_row1"]//div[@class="greeter"]/text()') #works
#desc = tree.xpath('//div[@id="OAS_TopLeft"]//script[@type="text/javascript"]/text()') #works
desc = tree.xpath('//div[@class="col2"]//div[@id="OperationDetails"]//table[@class="r_table1 r_txt2"]//tbody//tr//th[@class="row_lbl"]/text()')
I can't figure out the last path. It seems like I am following the path correctly, but I get an empty list.
The problem is that the Operation Details are loaded separately with an additional GET request. Simulate it in your code while maintaining a web-scraping session:
import requests
from lxml import html

with requests.Session() as session:
    page = session.get('http://financials.morningstar.com/company-profile/c.action?t=AAPL')
    tree = html.fromstring(page.text)

    # get the operational details
    response = session.get("http://financials.morningstar.com/company-profile/component.action", params={
        "component": "OperationDetails",
        "t": "XNAS:AAPL",
        "region": "usa",
        "culture": "en-US",
        "cur": "",
        "_": "1444848178406"
    })
    tree_details = html.fromstring(response.content)

    print tree_details.xpath('.//th[@class="row_lbl"]//text()')
Old answer:
It's just that you should remove tbody from the expression:
//div[@class="col2"]//div[@id="OperationDetails"]//table[@class="r_table1 r_txt2"]//tr//th[@class="row_lbl"]/text()
tbody is an element that is inserted by the browser to define the data rows in a table.
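A small illustration of why dropping tbody matters, using a made-up table snippet (lxml parses the HTML as served, without inserting tbody the way a browser does):

from lxml import html

snippet = '<table class="r_table1 r_txt2"><tr><th class="row_lbl">CIK</th><td>0000320193</td></tr></table>'
tree = html.fromstring(snippet)

print(tree.xpath('//table//tbody//tr//th[@class="row_lbl"]/text()'))  # [] - no tbody in the served source
print(tree.xpath('//table//tr//th[@class="row_lbl"]/text()'))         # ['CIK']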
I'm trying to scrape "game tag" data (not the same as HTML tags) from games listed on the digital game distribution site, Steam (store.steampowered.com). This information isn't available via the Steam API, as far as I can tell.
Once I have the raw source data for a page, I want to pass it into BeautifulSoup for further parsing, but I have a problem - urllib2 doesn't seem to be reading the information I want (requests doesn't work either), even though it's obviously in the source page when viewed in the browser.
For example, I might download the page for the game "7 Days to Die" (http://store.steampowered.com/app/251570/). When viewing the browser source page in Chrome, I can see the following relevant information regarding the game's "tags"
near the end, starting at line 1615:
<script type="text/javascript">
$J( function() {
InitAppTagModal( 251570,
{"tagid":1662,"name":"Survival","count":283,"browseable":true},
{"tagid":1659,"name":"Zombies","count":274,"browseable":true},
{"tagid":1702,"name":"Crafting","count":248,"browseable":true},...
In InitAppTagModal, there are the tags "Survival", "Zombies", "Crafting", etc. that contain the information I'd like to collect.
But when I use urllib2 to get the page source:
import urllib2
url = "http://store.steampowered.com/app/224600/" #7 Days to Die page
page = urllib2.urlopen(url).read()
The part of the source page that I'm interested in is not saved in my "page" variable; instead, everything below line 1555 is simply blank until the closing body and html tags, resulting in this (carriage returns included):
</div><!-- End Footer -->
</body>
</html>
In the blank space is where the source code I need (along with other code), should be.
I've tried this on several different computers with different installs of python 2.7 (Windows machines and a Mac), and I get the same result on all of them.
How can I get the data that I'm looking for?
Thank you for your consideration.
Well, I don't know if I'm missing something, but it's working for me using requests:
import requests
# Getting html code
url = "http://store.steampowered.com/app/251570/"
html = requests.get(url).text
What's more, the requested data is in JSON format, so it's easy to extract it this way:
import json

# Extracting javascript object (a json like object)
start_tag = 'InitAppTagModal( 251570,'
end_tag = '],'
startIndex = html.find(start_tag) + len(start_tag)
endIndex = html.find(end_tag, startIndex) + len(end_tag) - 1
raw_data = html[startIndex:endIndex]

# Load raw data as python json object
data = json.loads(raw_data)
You will see a beautiful JSON object like this (this is the info that you need, right?):
[
    {
        "count": 283,
        "browseable": true,
        "tagid": 1662,
        "name": "Survival"
    },
    {
        "count": 274,
        "browseable": true,
        "tagid": 1659,
        "name": "Zombies"
    },
    {
        "count": 248,
        "browseable": true,
        "tagid": 1702,
        "name": "Crafting"
    }......
I hope it helps....
UPDATED:
OK, I see your problem now; it seems the problem is with page 224600. In this case the webpage requires that you confirm your age before it shows you the game's info. Anyway, it's easy to solve by just posting the form that confirms the age. Here is the updated code (and I created a function):
import json

import requests


def extract_info_games(page_id):
    # Create session
    session = requests.session()

    # Get initial html
    html = session.get("http://store.steampowered.com/app/%s/" % page_id).text

    # Checking if I'm in the check age page (just checking if the check age form is in the html code)
    if ('<form action="http://store.steampowered.com/agecheck/app/%s/"' % page_id) in html:
        # I'm being redirected to the check age page,
        # so let's confirm my age with a POST:
        post_data = {
            'snr': '1_agecheck_agecheck__age-gate',
            'ageDay': 1,
            'ageMonth': 'January',
            'ageYear': '1960'
        }
        html = session.post('http://store.steampowered.com/agecheck/app/%s/' % page_id, post_data).text

    # Extracting javascript object (a json like object)
    start_tag = 'InitAppTagModal( %s,' % page_id
    end_tag = '],'
    startIndex = html.find(start_tag) + len(start_tag)
    endIndex = html.find(end_tag, startIndex) + len(end_tag) - 1
    raw_data = html[startIndex:endIndex]

    # Load raw data as python json object
    data = json.loads(raw_data)

    return data
And to use it:
extract_info_games(224600)
extract_info_games(251570)
Enjoy!
When using urllib2 and read(), you will have to read repeatedly in chunks till you hit EOF, in order to read the entire HTML source.
import urllib2
url = "http://store.steampowered.com/app/224600/" #7 Days to Die page
url_handle = urllib2.urlopen(url)
data = ""
while True:
    chunk = url_handle.read()
    if not chunk:
        break
    data += chunk
An alternative would be to use the requests module as:
import requests
from bs4 import BeautifulSoup

r = requests.get('http://store.steampowered.com/app/251570/')
soup = BeautifulSoup(r.text)
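From there, a hedged sketch for pulling the tag list out of that response with a regex instead of find() offsets; it assumes the InitAppTagModal( <appid>, [...] , shape shown earlier in the question:

import json
import re

import requests

r = requests.get('http://store.steampowered.com/app/251570/')

# Grab the JSON array passed as the second argument to InitAppTagModal.
match = re.search(r'InitAppTagModal\(\s*\d+,\s*(\[.*?\])\s*,', r.text, re.DOTALL)
if match:
    tags = json.loads(match.group(1))
    print([tag["name"] for tag in tags])  # e.g. ['Survival', 'Zombies', 'Crafting', ...]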