Python requests_html: printed response in an array

I am trying to check whether each link contains http and, if so, print the URL.
import requests
from requests_html import HTMLSession
import sys

link = "http://www.tvil.me/view/93/4/8/v/%D7%90%D7%99%D7%99_%D7%96%D7%95%D7%9E%D7%91%D7%99_IZombie.html"
enter_episodes = HTMLSession().get(link)
page = enter_episodes.html
s = page.xpath("//*[@class='view-watch-button']/a")
for l in s:
    link = l.links
    if link != "set()":
        print(link)
Response:
{'http://streamcloud.eu/ga4m4hizbrfb/iZombie.S04E08.HDTV.x264-SVA.mkv.html'}
{'http://uptostream.com/77p26f7twwhe'}
set()
{'https://clipwatching.com/aog2ni06rzjt/rrFhepnbFfpt6xg.mkv.html'}
set()
[Finished in 1.7s]
I tried to drop the empty set() responses and to print only the link itself, without the surrounding {' and '}.

You just need to make sure the set has a length of at least one, and then pop it:
import requests
from requests_html import HTMLSession
import sys

link = "http://www.tvil.me/view/93/4/8/v/%D7%90%D7%99%D7%99_%D7%96%D7%95%D7%9E%D7%91%D7%99_IZombie.html"
enter_episodes = HTMLSession().get(link)
page = enter_episodes.html
s = page.xpath("//*[@class='view-watch-button']/a")
for l in s:
    link = l.links
    if len(link) > 0:      # make sure the set is not empty
        print(link.pop())  # pop a value (in your case, the only one)
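If an element ever exposed more than one link, popping would only print one of them. A minimal alternative sketch (reusing s from the snippet above) simply iterates each set, which also skips the empty set() results naturally:
for l in s:
    for url in l.links:  # l.links is a set of URL strings; an empty set yields nothing
        print(url)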

Related

Crawling over a website's directories using BeautifulSoup?

This is my code:
https://pastebin.com/R11qiTF4
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as req
from urllib.parse import urljoin
import re

urls = ["https://www.helios-gesundheit.de"]
domain_list = ["https://www.helios-gesundheit.de/kliniken/schwerin/"]
prohibited = ["info", "news"]
text_keywords = ["Helios", "Helios"]
url_list = []
desired = "https://www.helios-gesundheit.de/kliniken/schwerin/unser-angebot/unsere-fachbereiche-klinikum/allgemein-und-viszeralchirurgie/team-allgemein-und-viszeralchirurgie/"

for x in range(len(domain_list)):
    url_list.append(urls[x] + domain_list[x].replace(urls[x], ""))
print(url_list)

def prohibitedChecker(prohibited_list, string):
    for x in prohibited_list:
        if x in string:
            return True
        else:
            return False
            break

def parseHTML(url):
    requestHTML = req(url)
    htmlPage = requestHTML.read()
    requestHTML.close()
    parsedHTML = soup(htmlPage, "html.parser")
    return parsedHTML

searched_word = "Helios"

for url in url_list:
    parsedHTML = parseHTML(url)
    href_crawler = parsedHTML.find_all("a", href=True)
    for href in href_crawler:
        crawled_url = urljoin(url, href.get("href"))
        print(crawled_url)
        if "www" not in crawled_url:
            continue
        parsedHTML = parseHTML(crawled_url)
        results = parsedHTML.body.find_all(string=re.compile('.*{0}.*'.format(searched_word)), recursive=True)
        for single_result in results:
            keyword_text_check = prohibitedChecker(text_keywords, single_result.string)
            if keyword_text_check != True:
                continue
            print(single_result.string)
I'm trying to print the contents of the desired variable. The problem is that my code never even requests the URL in desired, because that page is outside the scope of what gets crawled: the href that leads to desired sits inside another href on the page I'm currently scraping. I thought I could fix this by nesting another for loop inside the for href in href_crawler: loop that requests every href found in the first pass, but that gets messy and inefficient.
Is there a way to get a list of every directory of a website URL?
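One common way to reach pages that are only linked from other pages is a breadth-first crawl: keep a queue of URLs to visit and a set of URLs already seen, instead of nesting more for loops. A rough sketch along those lines, reusing the parseHTML helper and urljoin from the code above (the max_pages cap and the start URL are illustrative assumptions, and error handling for non-HTML responses is omitted):
from collections import deque

def crawl(start_url, max_pages=200):  # arbitrary safety cap, not a recommendation
    seen = {start_url}
    queue = deque([start_url])
    while queue and len(seen) < max_pages:
        current = queue.popleft()
        page = parseHTML(current)  # helper defined in the question's code
        for a in page.find_all("a", href=True):
            nxt = urljoin(current, a["href"])
            # stay on the same site and never revisit a page
            if nxt.startswith("https://www.helios-gesundheit.de") and nxt not in seen:
                seen.add(nxt)
                queue.append(nxt)
    return seen

# every URL reachable from the start page ends up in the returned set,
# including pages that are only linked from deeper pages, like "desired"
all_urls = crawl("https://www.helios-gesundheit.de/kliniken/schwerin/")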

BeautifulSoup - problems with a web crawler

How to correctly output all links on this news website (in list form)?
After outputting them as a list, how can I return results at random (3-5 links at a time)?
Note: the element I need starts around line 739 of the page source (roughly; it may shift a bit since the page refreshes every day):
div class="abdominis rlby clearmen"
and I need every link inside this kind of element, like
<a href="https://tw.news.appledaily.com/life/realtime/20180308/1310910/">
Thanks! The code is below:
from bs4 import BeautifulSoup
from flask import Flask, request, abort
import requests
import re
import random
import types
target_url = 'http://www.appledaily.com.tw/realtimenews/section/new/'
print('Start parsing appleNews....')
rs = requests.session()
res = rs.get(target_url, verify=False)
soup = BeautifulSoup(res.text, 'html.parser')
#can output all links but with useless information
contents = soup.select("div[class='abdominis rlby clearmen']")[0].find_all('a')
print(contents)
#can output single link but not in list form
#contents = soup.select("div[class='abdominis rlby clearmen']")[0].find('a').get('href')
#print(contents)
Here is a solution which appends each link to a list if it is contained in the specified div:
from bs4 import BeautifulSoup
from flask import Flask, request, abort
import requests
import re
import random
import types

target_url = 'http://www.appledaily.com.tw/realtimenews/section/new/'
print('Start parsing appleNews....')
rs = requests.session()
res = rs.get(target_url, verify=False)
soup = BeautifulSoup(res.text, 'html.parser')

list_links = []  # create an empty list
for a in soup.select("div[class='abdominis rlby clearmen']")[0].findAll(href=True):  # find links inside the div
    list_links.append(a['href'])  # append to the list
    print(a['href'])              # check the links
for l in list_links:  # print the list to screen (2nd check)
    print(l)
To pick random links to be returned:
import random               # import random module

random_list = []            # create a list for the random picks, if needed..
random.shuffle(list_links)  # shuffle the list in place
for i in range(5):          # specify range (5 items in this instance)
    try:
        res = list_links.pop(random.randint(0, len(list_links)))  # pop an item at a random index
        print(res)               # print to screen..
        random_list.append(res)  # ..or append to random_list
    except IndexError:
        pass
One last edit, since you asked for the links to be returned:
here it is as a function that returns a list of a given number of random links.
def return_random_link(list_, num):
    """ Takes in a list and returns a random amount of items """
    random.shuffle(list_)
    random_list = []
    for i in range(num):
        try:  # try to append to the list
            r = list_.pop(random.randint(0, len(list_)))
            random_list.append(r)
        except IndexError:  # except an IndexError (no items left)
            return random_list  # return the list of items collected so far
    return random_list

random_list = return_random_link(list_links, 5)
for i in random_list:
    print(i)
If you want the link tags without their descendants, you can clear them:
for elm in contents:
    elm.clear()
I imagine I'd be more interested in extracting just the links, though:
contents = [a['href'] for a in contents]
To get results in a random order, try random.shuffle() and then grab however many elements you need from the reshuffled list.
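For the random selection itself, random.sample is a compact alternative to shuffling and popping. A small sketch, assuming list_links has already been filled as in the first answer:
import random

def pick_random_links(links, count=5):
    # sample cannot pick more items than exist, so clamp the count
    return random.sample(links, min(count, len(links)))

for url in pick_random_links(list_links, 5):
    print(url)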

How to scrape data from a website with a "View More" option using the BeautifulSoup library in Python

I am trying to parse comments from this website link:
I need to get 1000 comments; by default the page shows only 10, and I cannot figure out how to get the content that appears after clicking 'View More'.
I have the following code so far:
import urllib.request
from bs4 import BeautifulSoup
import sys

non_bmp_map = dict.fromkeys(range(0x10000, sys.maxunicode + 1), 0xfffd)
response = urllib.request.urlopen("https://www.mygov.in/group-issue/share-your-ideas-pm-narendra-modis-mann-ki-baat-26th-march-2017/")
srcode = response.read()
soup = BeautifulSoup(srcode, "html.parser")

all_comments_div = soup.find_all('div', class_="comment_body")
all_comments = []
for div in all_comments_div:
    all_comments.append(div.find('p').text.translate(non_bmp_map))

print(all_comments)
print(len(all_comments))
You can use a while loop to fetch the next pages (i.e. while there is a next page and fewer than 1000 comments have been collected):
import urllib.request
from bs4 import BeautifulSoup
import sys

non_bmp_map = dict.fromkeys(range(0x10000, sys.maxunicode + 1), 0xfffd)
all_comments = []
max_comments = 1000
base_url = 'https://www.mygov.in/'
next_page = base_url + '/group-issue/share-your-ideas-pm-narendra-modis-mann-ki-baat-26th-march-2017/'

while next_page and len(all_comments) < max_comments:
    response = urllib.request.urlopen(next_page)
    srcode = response.read()
    soup = BeautifulSoup(srcode, "html.parser")

    all_comments_div = soup.find_all('div', class_="comment_body")
    for div in all_comments_div:
        all_comments.append(div.find('p').text.translate(non_bmp_map))

    next_page = soup.find('li', class_='pager-next first last')
    if next_page:
        next_page = base_url + next_page.find('a').get('href')
    print('comments: {}'.format(len(all_comments)))

print(all_comments)
print(len(all_comments))
The new comments are loaded via AJAX, so we need to request that JSON endpoint, pull the HTML out of the response, and then use BeautifulSoup on it, i.e.:
import json
import requests
import sys
from bs4 import BeautifulSoup

how_many_pages = 5  # how many comment pages do you want to parse?
non_bmp_map = dict.fromkeys(range(0x10000, sys.maxunicode + 1), 0xfffd)
all_comments = []

for x in range(how_many_pages):
    # note: mygov.in seems very slow...
    json_data = requests.get(
        "https://www.mygov.in/views/ajax/?view_name=view_comments&view_display_id=block_2"
        "&view_args=267721&view_path=node%2F267721&view_base_path=comment_pdf_export"
        "&view_dom_id=f3a7ae636cabc2c47a14cebc954a2ff0&pager_element=1"
        "&sort_by=created&sort_order=DESC&page=0,{}".format(x)).content
    d = json.loads(json_data.decode())  # remove .decode() for Python < 3
    print(len(d))
    if len(d) == 3:    # sometimes the json length is 3
        comments = d[2]['data']  # 'data' is the key that contains the comments html
    elif len(d) == 2:  # other times it is just 2...
        comments = d[1]['data']
    # from here, we can use your BeautifulSoup code
    soup = BeautifulSoup(comments, "html.parser")
    all_comments_div = soup.find_all('div', class_="comment_body")
    for div in all_comments_div:
        all_comments.append(div.find('p').text.translate(non_bmp_map))

print(all_comments)
Output:
["Sir my humble submission is that please ask public not to man handle doctors because they work in a very delicate situation, to save a patient is not always in his hand. The incidents of manhandling doctors is increasing day by day and it's becoming very difficult to work in these situatons. Majority are not Opting for medical profession,...']

Stuck on this task

From an online Python course:
You will be given a website with 100 names. All names are in the form of a link. Each link leads to another 100 links. You must use Python to follow the 18th link 7 times in a row, and print out the results.
My code so far:
z = 0
atags = []
listurl = []

# import modules
import urllib
from bs4 import BeautifulSoup
import re

newurl = "https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Desmond.html"
while z < 7:
    url = newurl
    z = z + 1
    html = urllib.urlopen(url).read()
    soup = BeautifulSoup(html)
    soup.find_all("url")
    a = soup.find_all('a')
    for x in a:
        atags.append(str(x))
    url_end_full = atags[19]
    url_end = re.findall(r'"(.*?)"', url_end_full)
    url_end = str(url_end[0])
    newurl = 'https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/' + url_end
    str(newurl)
    listurl.append(newurl)
    url = newurl
print url
It does not work; it keeps giving me the same link.
This is the output:
https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Lauchlin.html
[Finished in 2.4s]
The answer was wrong when I entered it into the answer box.
There are a couple of problems:
atags[19] is not the 18th item, it is the 20th (lst[0] is the first item in a list).
soup.find_all("url") does nothing; get rid of it.
You do not need re.
The links returned are relative; you are doing a hard join to the base path to make them absolute. In this case it works, but that is a matter of luck; do it right with urljoin.
While str(link) does get you the URL, the "proper" method is indexing into the attributes, i.e. link['href'].
With some judicious cleanup:
from bs4 import BeautifulSoup
import sys

# version compatibility shim
if sys.hexversion < 0x3000000:
    # Python 2.x
    from urlparse import urljoin
    from urllib import urlopen
else:
    # Python 3.x
    from urllib.parse import urljoin
    from urllib.request import urlopen

START_URL = "https://pr4e.dr-chuck.com/tsugi/mod/python-data/data/known_by_Desmond.html"
STEPS = 7
ITEM = 18

def get_soup(url):
    with urlopen(url) as page:
        return BeautifulSoup(page.read(), 'lxml')

def main():
    url = START_URL
    for step in range(STEPS):
        print("\nStep {}: looking at '{}'".format(step, url))
        # get the right item (Python arrays start indexing at 0)
        links = get_soup(url).find_all("a")
        rel_url = links[ITEM - 1]["href"]
        # convert from relative to absolute url
        url = urljoin(url, rel_url)
        print("  go to '{}'".format(url))

if __name__ == "__main__":
    main()
which, if I did it right, ends with known_by_Gideon.html
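If the grader expects the chain of names rather than full URLs, the last path component of each visited URL can be stripped down. name_from_url below is a hypothetical helper, not part of the course material; calling it on url at each step inside main() would give the name reached at that step:
def name_from_url(url):
    # hypothetical helper: ".../known_by_Gideon.html" -> "Gideon"
    filename = url.rsplit("/", 1)[-1]
    return filename.replace("known_by_", "").replace(".html", "")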

Get all internal links from all pages in a Domain

I am looking for code that will get all internal links from a website by iterating over every internal link [both absolute and relative] it finds.
So far I have managed to write this much, but I am unable to construct the right logic in the program.
import requests, csv, time
from lxml import html
from collections import OrderedDict

links = []
domain = 'bunchball.com'
base_link = 'http://www.bunchball.com/'
unique_list = []

def get_links(base_link):
    r = requests.get(base_link)
    source = html.fromstring(r.content)
    link = source.xpath('//a/@href')
    for each in link:
        each = str(each)
        if domain in each:
            links.append(each)
        elif each.startswith('/'):
            links.append(base_link + each)
            unique_list.append(each)
        else:
            pass

get_links(base_link)

#while
for each1 in list(OrderedDict.fromkeys(links)):
    get_links(each1)
    while each1 not in unique_list:
        unique_list.append(each1)
        get_links(each1)
Try with mechanize for a simpler solution:
from mechanize import Browser

br = Browser()
br.open("http://www.bunchball.com/")
list_of_links = [link for link in br.links()]
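Note that br.links() only yields the links of the page that is currently open, and their .url attributes may still be relative. A small follow-up sketch that makes them absolute and keeps only the internal ones, assuming mechanize's Link objects expose .base_url and .url as in current releases; wrapping this in a queue of pages to visit, as in the crawling sketch earlier on this page, would extend it to the whole domain:
from urllib.parse import urljoin

internal_links = []
for link in br.links():                          # Link objects from the open page
    absolute = urljoin(link.base_url, link.url)  # resolve relative hrefs
    if 'bunchball.com' in absolute:              # keep internal links only
        internal_links.append(absolute)
print(internal_links)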
