Stuck on this task - python

From an online python course:
You will be given a website with 100 names. All names are in the form of a link. Each link leads to another 100 links. You must use Python to select the 18th link 7 times in a row, and print out the results.
my code so far:
# NOTE(review): whitespace was lost when this was pasted, so the while- and
# for-bodies below are not actually indented; reconstruct before running.
# step counter for the 7 hops
z = 0
# accumulates the string form of every <a> tag seen so far (never reset!)
atags = []
# history of the urls visited
listurl = []
#import modules
import urllib
from bs4 import BeautifulSoup
import re
# starting page of the assignment
newurl = "https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Desmond.html"
while z < 7:
url = newurl
z = z + 1
# Python 2 urllib; in Python 3 this is urllib.request.urlopen
html = urllib.urlopen(url).read()
soup = BeautifulSoup(html)
# NOTE(review): this find_all("url") call does nothing useful -- there is no
# <url> tag and the result is discarded anyway.
soup.find_all("url")
a = soup.find_all('a')
for x in a:
atags.append(str(x))
# NOTE(review): atags[19] is the 20th element, not the 18th, and because
# atags is never cleared it keeps indexing into the FIRST page's tags on
# every iteration -- this is why the same link is printed each time.
url_end_full = atags[19]
# pull the quoted href value out of the serialized tag text
url_end = re.findall(r'"(.*?)"', url_end_full)
url_end = str(url_end[0])
# hard-joins the relative href onto the base path; urljoin would be safer
newurl = 'https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/' + url_end
# NOTE(review): this str() call has no effect -- the result is discarded.
str(newurl)
listurl.append(newurl)
url = newurl
print url
It does not work. It keeps giving me the same link...
this is the output:
https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Lauchlin.html
[Finished in 2.4s]
The answer was wrong when I entered it into the answer box.

There are a couple of problems.
atags[19] is not the 18th item, it is the 20th (lst[0] is the first item in a list).
soup.find_all("url") does nothing; get rid of it.
you do not need re.
The links returned are relative; you are doing a hard-join to the base path to make them absolute. In this case it works, but that is a matter of luck; do it right with urljoin.
While str(link) does get you the url, the "proper" method is by indexing into the attributes, ie link['href'].
With some judicious cleanup,
from bs4 import BeautifulSoup
import sys
# version compatibility shim
if sys.hexversion < 0x3000000:
# Python 2.x
from urlparse import urljoin
from urllib import urlopen
else:
# Python 3.x
from urllib.parse import urljoin
from urllib.request import urlopen
# page the assignment starts from
START_URL = "https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Desmond.html"
# how many times to follow a link
STEPS = 7
# follow the 18th link on each page (1-based)
ITEM = 18
def get_soup(url):
    """Fetch *url* and return its parsed BeautifulSoup tree.

    BUG FIX: the original used ``with urlopen(url) as page``, which breaks
    under the Python 2 branch of the version shim above -- the object
    returned by Python 2's ``urllib.urlopen`` is not a context manager.
    A try/finally gives the same guaranteed close on both versions.
    """
    page = urlopen(url)
    try:
        return BeautifulSoup(page.read(), 'lxml')
    finally:
        page.close()
def main():
    """Follow the ITEM-th link STEPS times, starting from START_URL."""
    current = START_URL
    for step in range(STEPS):
        print("\nStep {}: looking at '{}'".format(step, current))
        # Python sequences are 0-indexed, so the ITEM-th anchor lives at ITEM - 1.
        anchors = get_soup(current).find_all("a")
        relative = anchors[ITEM - 1]["href"]
        # Resolve the (possibly relative) href against the page it came from.
        current = urljoin(current, relative)
        print(" go to '{}'".format(current))

if __name__=="__main__":
    main()
which, if I did it right, ends with known_by_Gideon.html

Related

Python: Web Scraping with input from user

from bs4 import BeautifulSoup
from urllib.request import urlopen as uReq
import requests

url = 'https://en.wikisource.org/wiki/Main_Page'
r = requests.get(url)
Soup = BeautifulSoup(r.text, "html5lib")

# Anchors inside the "new texts" widget on the main page.
links = Soup.find("div", class_="enws-mainpage-widget-content", id="enws-mainpage-newtexts-content").find_all('a')

# Keep only titles without a namespace prefix (no ":" in the title).
# The original looped over every character to detect ":", and crashed if
# a link had no title attribute at all (get() returns None).
ebooks = []
for ebook in links:
    title = ebook.get('title')
    if title and ":" not in title:
        ebooks.append(title)

print("Please select a book: ")
for number, name in enumerate(ebooks, start=1):
    print(number, " - ", name)

# Read the user's choice without shadowing the built-in input().
choice = int(input())

# BUG FIX: "href" is an attribute, not a tag name, so the original
# Soup.find("href", ...) always returned None. Search for an <a> tag whose
# title matches, then read its href attribute.
selectedbook = Soup.find("a", title=ebooks[choice - 1])
print(selectedbook['title'])
# hrefs on wikisource are relative, so prepend the site root
print("https://en.wikisource.org/" + selectedbook['href'])
I want to get the href of whichever was selected by user but as output I get: None
Can someone please tell me where I am going wrong?
I changed the last two lines of your code, and added these
# Search for an <a> TAG (not the "href" attribute) whose title matches the
# user's selection; find() returns a bs4.element.Tag.
selectedbook = Soup.find("a", title=ebooks[input-1])
print(selectedbook['title'])
# hrefs on wikisource are relative, so prepend the site root
print("https://en.wikisource.org/"+selectedbook['href'])
This just works!
NB: The find() method searches for the first tag with the specified name and returns an object of type bs4.element.Tag.

Crawling over a website directories using BeautifulSoup?

This is my code:
https://pastebin.com/R11qiTF4
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as req
from urllib.parse import urljoin
import re
# seed domain(s); indexes here pair up with domain_list below
urls = ["https://www.helios-gesundheit.de"]
domain_list = ["https://www.helios-gesundheit.de/kliniken/schwerin/"]
# url substrings that disqualify a page
prohibited = ["info", "news"]
# NOTE(review): both entries are the same word -- presumably one was meant
# to be a different keyword; verify.
text_keywords = ["Helios", "Helios"]
url_list = []
# the page the author ultimately wants to reach (two link-hops deep)
desired = "https://www.helios-gesundheit.de/kliniken/schwerin/unser-angebot/unsere-fachbereiche-klinikum/allgemein-und-viszeralchirurgie/team-allgemein-und-viszeralchirurgie/"
# rebuild each start url from its domain prefix + path remainder
for x in range(len(domain_list)):
url_list.append(urls[x]+domain_list[x].replace(urls[x], ""))
print(url_list)
def prohibitedChecker(prohibited_list, string):
    """Return True if *string* contains ANY entry of *prohibited_list*.

    BUG FIX: the original returned after inspecting only the first list
    element (the ``else`` branch returned False immediately), so later
    entries were never checked; the trailing ``break`` was unreachable.
    """
    return any(word in string for word in prohibited_list)
def parseHTML(url):
    """Download *url* and return its BeautifulSoup parse tree."""
    connection = req(url)
    raw_bytes = connection.read()
    connection.close()
    return soup(raw_bytes, "html.parser")
searched_word = "Helios"
# one level of crawling: fetch each start page, then every link on it
for url in url_list:
parsedHTML = parseHTML(url)
href_crawler = parsedHTML.find_all("a", href=True)
for href in href_crawler:
# resolve relative hrefs against the page they were found on
crawled_url = urljoin(url,href.get("href"))
print(crawled_url)
if "www" not in crawled_url:
continue
# NOTE(review): this rebinds parsedHTML inside the inner loop; it is
# harmless only because href_crawler was already materialised above.
parsedHTML = parseHTML(crawled_url)
# all text nodes anywhere in the body that mention the search word
results = parsedHTML.body.find_all(string=re.compile('.*{0}.*'.format(searched_word)), recursive=True)
for single_result in results:
# keep only results whose text contains one of the keywords
keyword_text_check = prohibitedChecker(text_keywords, single_result.string)
if keyword_text_check != True:
continue
print(single_result.string)
I'm trying to print the contents of ''desired'' variable. The problem is the following, my code doesn't even get to request the URL of ''desired'' because its not in the website scope. ''desired'' href link is inside another href link that's inside the page I'm currently scraping. I thought I'd fix this by adding another for loop inside line 39 for loop, that requests every href found in my first, but this is too messy and not efficient
Is there way to get a list of every directory of a website url?

How to scrape data from website having "View More" option using BeautifulSoup library in python

I am trying to parse comments from this website link :
I need to get 1000 comments, by default it shows only 10
I want to get 1000 comments, it shows only 10 by default. I am unable to figure out a way to get the content which shows on the webpage after clicking 'View More'
I have the following code uptil now:
import urllib.request
from bs4 import BeautifulSoup
import sys
# map characters beyond the BMP to U+FFFD so print() cannot choke on them
non_bmp_map = dict.fromkeys(range(0x10000, sys.maxunicode + 1), 0xfffd)
# NOTE(review): this string literal is split across two lines by the paste;
# rejoin it into a single line before running.
response = urllib.request.urlopen("https://www.mygov.in/group-issue/share-
your-ideas-pm-narendra-modis-mann-ki-baat-26th-march-2017/")
srcode = response.read()
soup = BeautifulSoup(srcode, "html.parser")
# only the first, statically served page of comments is in this HTML;
# the "View More" content is loaded separately by the browser
all_comments_div=soup.find_all('div', class_="comment_body");
all_comments=[]
for div in all_comments_div:
all_comments.append(div.find('p').text.translate(non_bmp_map))
print (all_comments)
print (len(all_comments))
You can use a while loop to get the next pages
( ie while there is a next page and all comments are less than 1000 )
import urllib.request
from urllib.parse import urljoin
from bs4 import BeautifulSoup
import sys

# Map characters outside the Basic Multilingual Plane to U+FFFD so the
# comment text can always be printed.
non_bmp_map = dict.fromkeys(range(0x10000, sys.maxunicode + 1), 0xfffd)

all_comments = []
max_comments = 1000
base_url = 'https://www.mygov.in/'
# BUG FIX: urljoin avoids the double slash that the original string
# concatenation ('.../': + '/group-issue/...') produced.
next_page = urljoin(base_url, '/group-issue/share-your-ideas-pm-narendra-modis-mann-ki-baat-26th-march-2017/')

# Keep following the pager while there is a next page and we still need comments.
while next_page and len(all_comments) < max_comments:
    # BUG FIX: the original had a duplicated assignment ("response = response = ...").
    response = urllib.request.urlopen(next_page)
    srcode = response.read()
    soup = BeautifulSoup(srcode, "html.parser")
    for div in soup.find_all('div', class_="comment_body"):
        all_comments.append(div.find('p').text.translate(non_bmp_map))
    # the "next" pager item holds the link to the following comments page
    next_page = soup.find('li', class_='pager-next first last')
    if next_page:
        next_page = urljoin(base_url, next_page.find('a').get('href'))
        print('comments: {}'.format(len(all_comments)))

print(all_comments)
print(len(all_comments))
The new comments are loaded via AJAX; we need to parse the JSON response and then extract the comment HTML with BeautifulSoup, i.e.:
import json
import requests
import sys
from bs4 import BeautifulSoup
how_many_pages = 5 # how many comments pages you want to parse?
# map non-BMP characters to U+FFFD so printing cannot fail
non_bmp_map = dict.fromkeys(range(0x10000, sys.maxunicode + 1), 0xfffd)
all_comments = []
for x in range(how_many_pages):
# note: mygov.in seems very slow...
# hit the site's Drupal ajax endpoint directly; {} is the page number
json_data = requests.get(
"https://www.mygov.in/views/ajax/?view_name=view_comments&view_display_id=block_2&view_args=267721&view_path=node%2\
F267721&view_base_path=comment_pdf_export&view_dom_id=f3a7ae636cabc2c47a14cebc954a2ff0&pager_element=1&sort_by=created&sort_order=DESC&page=0,{}"\
.format(x)).content
d = json.loads(json_data.decode()) # Remove .decode() for python < 3
print(len(d))
# NOTE(review): if len(d) is neither 3 nor 2, `comments` keeps its value
# from the previous iteration -- or is undefined on the first one.
if len(d) == 3: # sometimes the json length is 3
comments = d[2]['data'] # data is the key that contains the comments html
elif len(d) == 2: # others just 2...
comments = d[1]['data']
#From here, we can use your BeautifulSoup code.
soup = BeautifulSoup(comments, "html.parser")
all_comments_div = soup.find_all('div', class_="comment_body");
for div in all_comments_div:
all_comments.append(div.find('p').text.translate(non_bmp_map))
print(all_comments)
Output:
["Sir my humble submission is that please ask public not to man handle doctors because they work in a very delicate situation, to save a patient is not always in his hand. The incidents of manhandling doctors is increasing day by day and it's becoming very difficult to work in these situatons. Majority are not Opting for medical profession,...']

Having problems following links with webcrawler

I am trying to create a webcrawler that parses all the html on the page, grabs a specified (via raw_input) link, follows that link, and then repeats this process a specified number of times (once again via raw_input). I am able to grab the first link and successfully print it. However, I am having problems "looping" the whole process, and usually grab the wrong link. This is the first link
https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Fikret.html
(Full disclosure, this questions pertains to an assignment for a Coursera course)
Here's my code
# Python 2 script (raw_input, print statement, old BeautifulSoup package)
import urllib
from BeautifulSoup import *
url = raw_input('Enter - ')
rpt=raw_input('Enter Position')
rpt=int(rpt)
cnt=raw_input('Enter Count')
cnt=int(cnt)
count=0
counts=0
tags=list()
soup=None
# NOTE(review): x is never initialised, so this line raises NameError.
while x==0:
html = urllib.urlopen(url).read()
soup = BeautifulSoup(html)
# Retrieve all of the anchor tags
tags=soup.findAll('a')
# NOTE(review): this loop overwrites url with EVERY href in turn; combined
# with the count/rpt break it usually lands on the wrong link -- indexing
# tags[rpt-1] directly is the intended behaviour.
for tag in tags:
url= tag.get('href')
count=count + 1
if count== rpt:
break
counts=counts + 1
if counts==cnt:
# NOTE(review): == compares (and discards the result); = was intended,
# so the while condition never changes.
x==1
else: continue
print url
Based on DJanssens' response, I found the solution;
url = tags[position-1].get('href')
did the trick for me!
Thanks for the assistance!
I also worked on that course, and help with a friend, I got this worked out:
# Python 2 script (print statement, urllib.urlopen)
import urllib
from bs4 import BeautifulSoup
url = "http://python-data.dr-chuck.net/known_by_Happy.html"
# number of hops to make
rpt=7
# 1-based index of the link to follow on each page
position=18
count=0
# NOTE(review): counts and x are vestigial -- counts is never used and x is
# never changed; the loop exits only via the break below.
counts=0
tags=list()
soup=None
x=0
while x==0:
html = urllib.urlopen(url).read()
soup = BeautifulSoup(html,"html.parser")
tags=soup.findAll('a')
# follow the 18th anchor (position-1 converts to 0-based indexing)
url= tags[position-1].get('href')
count=count + 1
if count == rpt:
break
print url
I believe this is what you are looking for:
# Python 2 script (raw_input, xrange, print statement)
import urllib
from bs4 import *
url = raw_input('Enter - ')
position=int(raw_input('Enter Position'))
count=int(raw_input('Enter Count'))
#perform the loop "count" times.
for _ in xrange(0,count):
html = urllib.urlopen(url).read()
soup = BeautifulSoup(html)
tags=soup.findAll('a')
# NOTE(review): this inner loop just leaves url equal to the LAST href on
# the page, and its result is overwritten below -- it is dead code.
for tag in tags:
url= tag.get('href')
tags=soup.findAll('a')
# if the link does not exist at that position, show error.
# NOTE(review): `if not tags[position-1]` raises IndexError rather than
# printing the message when the list is too short; a length check or
# try/except IndexError would be needed for that.
if not tags[position-1]:
print "A link does not exist at that position."
# if the link at that position exist, overwrite it so the next search will use it.
url = tags[position-1].get('href')
print url
The code will now loop the amount of times as specified in the input, each time it will take the href at the given position and replace it with the url, in that way each loop will look further in the tree structure.
I advise you to use full, descriptive names for variables, which are a lot easier to understand. In addition, you could cast and read each input in a single line, which makes the beginning of your script easier to follow.
Here is my 2-cents:
# Python 2 script (raw_input, urllib.urlopen)
import urllib
#import ssl
from bs4 import BeautifulSoup
#'http://py4e-data.dr-chuck.net/known_by_Fikret.html'
url = raw_input('Enter URL : ')
position = int(raw_input('Enter position : '))
count = int(raw_input('Enter count : '))
print('Retrieving: ' + url)
# parse the starting page once, before the loop
soup = BeautifulSoup(urllib.urlopen(url).read())
for x in range(1, count + 1):
# collect every href on the current page, in document order
link = list()
for tag in soup('a'):
link.append(tag.get('href', None))
print('Retrieving: ' + link[position - 1])
# fetch and parse the chosen link; the next iteration works on that page
soup = BeautifulSoup(urllib.urlopen(link[position - 1]).read())

Get list of all paginated URL's from links in txt file in python requests

Hi guys. I need to define a function that gets the list of all paginated URLs at the bottom of the page, starting from links stored in a text file, in Python.
Here is an example of what i need done.
Input link
http://www.apartmentguide.com/apartments/Alabama/Hartselle/
Desired Output
www.apartmentguide.com/apartments/Alabama/Hartselle/?page=2
www.apartmentguide.com/apartments/Alabama/Hartselle/?page=3
www.apartmentguide.com/apartments/Alabama/Hartselle/?page=4
www.apartmentguide.com/apartments/Alabama/Hartselle/?page=5
www.apartmentguide.com/apartments/Alabama/Hartselle/?page=6
www.apartmentguide.com/apartments/Alabama/Hartselle/?page=7
www.apartmentguide.com/apartments/Alabama/Hartselle/?page=8
www.apartmentguide.com/apartments/Alabama/Hartselle/?page=9
so on to any limit each Input Url have.
This is the function I have written so far, but it's not working; I am not very good with Python either.
# Python 2 script (urllib2, print statement)
import requests
#from bs4 import BeautifulSoup
from scrapy import Selector as Se
import urllib2
# NOTE(review): in Python 3 "C:\Users..." is an invalid \U escape; use a raw
# string or forward slashes. The file handle is also never closed.
lists = open("C:\Users\Administrator\Desktop\\3.txt","r")
read_list = lists.read()
line = read_list.split("\n")
def get_links(line):
for each in line:
r = requests.get(each)
sel = Se(text=r.text, type="html")
# NOTE(review): XPath attribute syntax is @class / @href, not #class /
# #href -- as written this selects nothing.
next_ = sel.xpath('//a[#class="next sprite"]//#href').extract()
for next_1 in next_:
next_2 = "http://www.apartmentguide.com"+next_1
print next_2
# NOTE(review): recursing with a single URL string makes the for-loop
# above iterate over its CHARACTERS; pass a list like [next_2] instead.
get_links(next_1)
get_links(line)
Below are two ways to do this.
import mechanize
import requests
from bs4 import BeautifulSoup, SoupStrainer
import urlparse
import pprint
#-- Mechanize --
br = mechanize.Browser()

def get_links_mechanize(root):
    """Return the absolute URLs of pagination links (class == 'page') on *root*.

    Uses the module-level mechanize Browser ``br``.
    """
    links = []
    br.open(root)
    for link in br.links():
        # BUG FIX: only catch the expected failure (no 'class' attribute);
        # the original bare except also swallowed KeyboardInterrupt and
        # real programming errors.
        try:
            if dict(link.attrs)['class'] == 'page':
                links.append(link.absolute_url)
        except KeyError:
            pass
    return links
#-- Requests / BeautifulSoup / urlparse --
def get_links_bs(root):
    """Return the absolute URLs of pagination links ('page' in class) on *root*."""
    links = []
    r = requests.get(root)
    # Name the parser explicitly (omitting it is machine-dependent and warns),
    # and iterate find_all('a') so has_attr is never called on a
    # NavigableString that iterating the soup object directly can yield.
    soup = BeautifulSoup(r.text, "html.parser", parse_only=SoupStrainer('a'))
    for link in soup.find_all('a'):
        if link.has_attr('href') and link.has_attr('class') and 'page' in link.get('class'):
            links.append(urlparse.urljoin(root, link.get('href')))
    return links
#with open("C:\Users\Administrator\Desktop\\3.txt","r") as f:
# for root in f:
# links = get_links(root)
# # <Do something with links>
# demo run against a single root page (Python 2 print statements)
root = 'http://www.apartmentguide.com/apartments/Alabama/Hartselle/'
print "Mech:"
pprint.pprint( get_links_mechanize(root) )
print "Requests/BS4/urlparse:"
pprint.pprint( get_links_bs(root) )
One uses mechanize -- it's a bit smarter with URLs but it's a lot slower and may be overkill depending on what else you're doing.
The other uses requests to fetch the page (urllib2 would suffice), BeautifulSoup to parse the markup and urlparse to form absolute URLs from the relative URLs in the page you listed.
Note that both of these functions return the following list:
['http://www.apartmentguide.com/apartments/Alabama/Hartselle/?page=2',
'http://www.apartmentguide.com/apartments/Alabama/Hartselle/?page=3',
'http://www.apartmentguide.com/apartments/Alabama/Hartselle/?page=4',
'http://www.apartmentguide.com/apartments/Alabama/Hartselle/?page=5',
'http://www.apartmentguide.com/apartments/Alabama/Hartselle/?page=2',
'http://www.apartmentguide.com/apartments/Alabama/Hartselle/?page=3',
'http://www.apartmentguide.com/apartments/Alabama/Hartselle/?page=4',
'http://www.apartmentguide.com/apartments/Alabama/Hartselle/?page=5']
which has duplicates. You can get rid of the duplicates by changing
return links
to
return list(set(links))
for whatever method you choose.
EDIT:
I noticed that the above functions only returned the links to pages 2-5, and you'd have to navigate those pages to see that there were in fact 10 pages.
A completely different approach would be to scrape the "root" page for number of results, then predict how many pages that would result in, then build links from that.
Since there are 20 results per page, figuring out how many pages is straightforward, consider:
import requests, re, math, pprint
def scrape_results(root):
    """Predict pagination URLs for *root* from its "We have N apartments" banner.

    Assumes 20 results per page and returns
    ["<root>?page=1", ..., "<root>?page=<ceil(N/20)>"].

    Raises ValueError when the result-count text cannot be found.
    """
    r = requests.get(root)
    mat = re.search(r'We have (\d+) apartments for rent', r.text)
    if mat is None:
        # BUG FIX: fail loudly instead of an opaque AttributeError on
        # .group() if the site ever changes its wording.
        raise ValueError("result count not found on page: %s" % root)
    num_results = int(mat.group(1))  # 182 at the moment
    num_pages = int(math.ceil(num_results / 20.0))  # ceil(182/20) => 10
    # Construct links for pages 1..num_pages
    return ["%s?page=%d" % (root, i + 1) for i in range(num_pages)]

pprint.pprint(scrape_results(root))
This will be the fastest method of the 3, but possibly more error prone.
EDIT 2:
Maybe something like:
import re, math, pprint
import requests, urlparse
from bs4 import BeautifulSoup, SoupStrainer
def get_pages(root):
    """Predict pagination URLs for *root* from its result-count banner.

    Assumes 20 results per page; returns ["<root>?page=1", ...].
    Raises ValueError when the result-count text cannot be found.
    """
    r = requests.get(root)
    mat = re.search(r'We have (\d+) apartments for rent', r.text)
    if mat is None:
        # BUG FIX: guard against an opaque AttributeError on .group()
        # if the site ever changes its wording.
        raise ValueError("result count not found on page: %s" % root)
    num_results = int(mat.group(1))  # 182 at the moment
    num_pages = int(math.ceil(num_results / 20.0))  # ceil(182/20) => 10
    # Construct links for pages 1..num_pages
    return ["%s?page=%d" % (root, i + 1) for i in range(num_pages)]
def get_listings(page):
    """Return the absolute listing URLs found on one results *page*."""
    links = []
    r = requests.get(page)
    # Name the parser explicitly; omitting it is machine-dependent and warns.
    soup = BeautifulSoup(r.text, "html.parser", parse_only=SoupStrainer('a'))
    for link in soup.find_all('a'):
        if link.has_attr('href') and link.has_attr('data-listingid') and 'name' in link.get('class'):
            # BUG FIX: join relative hrefs against the page they came from,
            # not the global ``root`` the original reached for.
            links.append(urlparse.urljoin(page, link.get('href')))
    return links
root='http://www.apartmentguide.com/apartments/Alabama/Hartselle/'
listings = []
# predicted page urls -> listing links found on each page
for page in get_pages(root):
listings += get_listings(page)
pprint.pprint(listings)
print(len(listings))
With re I was unsure, so I tried XPath instead.
# NOTE(review): this snippet relies on Selector (scrapy), requests, math and
# csv being imported elsewhere; none of them are imported here.
# NOTE(review): "C:\Users..." is an invalid \U escape in Python 3 -- use a
# raw string. The input file handle is also never closed.
links = open("C:\Users\ssamant\Desktop\Anida\Phase_II\Apartmentfinder\\2.txt","r")
read_list = links.read()
line = read_list.split("\n")
for each in line:
lines = []
r = requests.get(each)
sel = Selector(text=r.text,type="html")
# grab the "<N> apartments for rent" headline text
mat = sel.xpath('//h1//strong/text()').extract()
mat = str(mat)
# NOTE(review): stripping the list syntax with replace() is fragile;
# indexing extract()[0] and a digits-only regex would be safer.
mat1 = mat.replace(" apartments for rent']","")
mat2 = mat1.replace("[u'","")
mat3 = int(mat2)
# 20 results per page
num_pages = int(math.ceil(mat3/20.0))
for i in range(num_pages):
lines.append("%s/Page%d" % (each, (i+1)))
with open('C:\Users\ssamant\Desktop\Anida\Phase_II\Apartmentfinder\\test.csv', 'ab') as f:
writer = csv.writer(f)
for val in lines:
writer.writerow([val])

Categories