Select href link with python requests

I was trying to use the requests library in Python to search Stack Overflow's search bar for questions, take the first 3 links found, get the content of those pages, and send it to my Notion API, but I got stuck on how to get the URL that each result links to.
import requests
from bs4 import BeautifulSoup

# online searches
def stackoverflow(question):
    questionAdjusted = question.replace(' ', '+')
    Req = requests.get("https://pt.stackoverflow.com/search?q=" + questionAdjusted)
    soup = BeautifulSoup(Req.text, "html.parser")
    questions = soup.select(".question-summary")
    for que in questions:
        # print(que.select_one('.question-hyperlink').getText().replace('P: ', ''))
        # print(que.select_one('.question-hyperlink').getText().replace('P: ', '').replace(' ', '-').replace('--------', ''))
        for link in soup.findAll('a', href=(que.select_one('.question-hyperlink').getText().replace('P: ', '').replace(' ', '-').replace('--------', ''))):
            print(link['href'])

stackoverflow('python database')
This is what I have so far.

To get the first 3 links and their descriptions from the URL, you can use the following example:
import requests
from bs4 import BeautifulSoup

def stackoverflow(question):
    url = "https://pt.stackoverflow.com/search"
    r = requests.get(url, params={"q": question})
    soup = BeautifulSoup(r.content, "html.parser")
    questions = soup.select(".question-hyperlink")
    for q in questions[:3]:  # <-- select only the first 3
        print(q.get_text(strip=True).replace("P: ", ""))
        print("https://pt.stackoverflow.com" + q["href"])
        print()

stackoverflow("python database")
Prints:
Select from e insert into em outro database com Python
https://pt.stackoverflow.com/questions/376648/select-from-e-insert-into-em-outro-database-com-python?r=SearchResults
Finalizando um projeto em python [duplicada]
https://pt.stackoverflow.com/questions/259591/finalizando-um-projeto-em-python?r=SearchResults
Erro de conexão com SQL Server 2012 com Python
https://pt.stackoverflow.com/questions/478779/erro-de-conex%c3%a3o-com-sql-server-2012-com-python?r=SearchResults
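Since the original goal was also to fetch the content of each result page, here is a minimal sketch of how that could look, building on the answer above. The get_first_links helper name is my own, and the .js-post-body selector is an assumption about the class Stack Overflow currently uses for post bodies; sending the text to the Notion API is left out:
import requests
from bs4 import BeautifulSoup

BASE = "https://pt.stackoverflow.com"

def get_first_links(question, n=3):
    # same search request as in the answer above
    r = requests.get(BASE + "/search", params={"q": question})
    soup = BeautifulSoup(r.content, "html.parser")
    return [BASE + a["href"] for a in soup.select(".question-hyperlink")[:n]]

for link in get_first_links("python database"):
    page_soup = BeautifulSoup(requests.get(link).content, "html.parser")
    # .js-post-body is assumed to be the class of question/answer bodies
    body = page_soup.select_one(".js-post-body")
    print(link)
    if body is not None:
        print(body.get_text(strip=True)[:100], "...")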

Web scraping with BeautifulSoup

I'm trying to print the information inside a span tag, but I get an empty print.
This is the website: https://mubi.com/it/films/25-watts/cast?type=cast
I'm trying to print all the actors' names.
Here is my code:
import requests
from bs4 import BeautifulSoup

url = 'https://mubi.com/it/films/25-watts/cast?type=cast'  # vincitori (winners)

def main():
    response = requests.get(url)
    html = response.text
    soup1 = BeautifulSoup(html, 'html.parser')
    cast = soup1.find_all('span', {'class': 'css-1marmfu e1a7pc1u9'})
    for tag in cast:
        print(tag)

if __name__ == '__main__':
    main()
Thanks for your support ;)
The data you see on the page is loaded from an external URL via JavaScript (so BeautifulSoup doesn't see it). You can use the requests module to simulate the Ajax request:
import json
import requests
from bs4 import BeautifulSoup

url = "https://mubi.com/it/films/25-watts/cast?type=cast"

soup = BeautifulSoup(requests.get(url).content, "html.parser")
data = json.loads(soup.select_one("#__NEXT_DATA__").contents[0])

# uncomment this to print all data:
# print(json.dumps(data, indent=4))

film = data["props"]["initialProps"]["pageProps"]["film"]

cast_url = "https://api.mubi.com/v3/films/{}/cast_members?sort=relevance&type=cast&page=1"
cast = requests.get(
    cast_url.format(film["id"]),
    headers={"CLIENT": "web", "Client-Country": "US"},
).json()

# print(json.dumps(cast, indent=4))

for m in cast["cast_members"]:
    print("{:<30} {:<30}".format(m["name"], m["primary_type"] or "-"))
Prints:
Daniel Hendler                 Actor
Jorge Temponi                  Actor
Alfonso Tort                   Actor
Valentín Rivero                -
Federico Veiroj                Director
Valeria Mendieta               -
Roberto Suárez                 Actor
Gonzalo Eyherabide             -
Robert Moré                    Actor
Ignacio Mendy -
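If a film has more cast members than fit on one page, the page=1 parameter in cast_url suggests the API paginates. Here is a minimal sketch of a paging loop (the all_cast_members helper is mine), under the assumption that an empty cast_members list marks the last page:
import requests

cast_url = "https://api.mubi.com/v3/films/{}/cast_members?sort=relevance&type=cast&page={}"
headers = {"CLIENT": "web", "Client-Country": "US"}

def all_cast_members(film_id):
    # keep requesting pages until the API returns no more members
    members, page = [], 1
    while True:
        batch = requests.get(cast_url.format(film_id, page), headers=headers).json()
        if not batch.get("cast_members"):
            break
        members.extend(batch["cast_members"])
        page += 1
    return members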

How to scrape an address (comma-separated text) using BeautifulSoup in Python

I am trying to scrape the address from the link below:
https://www.yelp.com/biz/rollin-phatties-houston
But I am getting only the first part of the address (i.e. 1731 Westheimer Rd) out of the complete comma-separated address:
1731 Westheimer Rd, Houston, TX 77098
Can anyone help me out with this? Please find my code below:
import bs4 as bs
import urllib.request as url

source = url.urlopen('https://www.yelp.com/biz/rollin-phatties-houston')
soup = bs.BeautifulSoup(source, 'html.parser')
mains = soup.find_all("div", {"class": "secondaryAttributes__09f24__3db5x arrange-unit__09f24__1gZC1 border-color--default__09f24__R1nRO"})
main = mains[0]  # First item of mains

address = []
for main in mains:
    try:
        address.append(main.address.find("p").text)
    except:
        address.append("")

print(address)
# 1731 Westheimer Rd
import requests
import re
from ast import literal_eval

def main(url):
    r = requests.get(url)
    match = literal_eval(
        re.search(r'addressLines.+?(\[.+?])', r.text).group(1))
    print(*match)

main('https://www.yelp.com/biz/rollin-phatties-houston')
Output:
1731 Westheimer Rd Houston, TX 77098
There is no need to find the address information by inspecting the element; the data is actually already passed onto the page inside a JavaScript tag. You can get it with the following code:
import chompjs
import bs4 as bs
import urllib.request as url

source = url.urlopen('https://www.yelp.com/biz/rollin-phatties-houston')
soup = bs.BeautifulSoup(source, 'html.parser')

javascript = soup.select("script")[16].string
data = chompjs.parse_js_object(javascript)
print(data['bizDetailsPageProps']['bizContactInfoProps']['businessAddress'])
The business address shown on the webpage is generated dynamically. If you view the page source of the URL, you will find that the restaurant's address is stored in a script element, so you need to extract the address from it.
from bs4 import BeautifulSoup
import requests
import json
page = requests.get('https://www.yelp.com/biz/rollin-phatties-houston')
htmlpage = BeautifulSoup(page.text, 'html.parser')
scriptelements = htmlpage.find_all('script', attrs={'type':'application/json'})
scriptcontent = scriptelements[2].text
scriptcontent = scriptcontent.replace('<!--', '')
scriptcontent = scriptcontent.replace('-->', '')
jsondata = json.loads(scriptcontent)
print(jsondata['bizDetailsPageProps']['bizContactInfoProps']['businessAddress'])
Using the above code, you can extract the address of other businesses as well, though the hardcoded script index may vary from page to page.
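Since a hardcoded index like scriptelements[2] can break when Yelp reorders its script tags, here is a hedged variation that scans every application/json script for the key instead (same page and key path as above):
from bs4 import BeautifulSoup
import requests
import json

page = requests.get('https://www.yelp.com/biz/rollin-phatties-houston')
htmlpage = BeautifulSoup(page.text, 'html.parser')

for script in htmlpage.find_all('script', attrs={'type': 'application/json'}):
    # strip the HTML-comment wrapper around the JSON payload
    content = script.text.replace('<!--', '').replace('-->', '')
    try:
        jsondata = json.loads(content)
    except json.JSONDecodeError:
        continue
    # only one of the JSON blobs carries the contact info
    props = jsondata.get('bizDetailsPageProps', {})
    if 'bizContactInfoProps' in props:
        print(props['bizContactInfoProps']['businessAddress'])
        break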

How to parse the top 250 movie titles using regular expressions in Python

Hi, I'm practicing regular expressions with Python to parse the titles of the top 250 movies from IMDb, but I'm having difficulty searching for content between two tags, for example a link whose text is:
The Godfather
import re, urllib.request

def movie(url):
    web_page = urllib.request.urlopen(url)
    lines = web_page.read().decode(errors="replace")
    web_page.close()
    return re.findall('(?<=<a>).+?(?=</a>)', lines, re.DOTALL)

title = movie("https://www.imdb.com/search/title?groups=top_250&sort=user_rating")
for name in title:
    print(name)
As pointed out in the comments, you'd better give BeautifulSoup a try. Something like this will list the titles, in Python 3:
import requests
from bs4 import BeautifulSoup

html = requests.get('https://www.imdb.com/search/title?groups=top_250&sort=user_rating')
if html.ok:
    soup = BeautifulSoup(html.text, 'html.parser')
    html.close()
    for title in soup('h3', 'lister-item-header'):
        print(title('a')[0].get_text())
And here is a cleaner version of the code above:
import requests
from bs4 import BeautifulSoup

imdb_entry_point = 'https://www.imdb.com/search/title'
imdb_payload = {
    'groups': 'top_250',
    'sort': 'user_rating'
}

with requests.get(imdb_entry_point, imdb_payload) as imdb:
    if imdb.ok:
        html = BeautifulSoup(imdb.text, 'html.parser')
        for i, h3 in enumerate(html('h3', 'lister-item-header'), 1):
            for a in h3('a'):
                print(i, a.get_text())
BTW, that entry point returns just 50 results, not the 250 you are expecting.
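To collect all 250 titles, one could page through the results. A minimal sketch, assuming the endpoint honors the start query parameter that IMDb's search pages use for pagination:
import requests
from bs4 import BeautifulSoup

imdb_entry_point = 'https://www.imdb.com/search/title'

titles = []
for start in range(1, 251, 50):  # pages begin at results 1, 51, 101, ...
    payload = {'groups': 'top_250', 'sort': 'user_rating', 'start': start}
    with requests.get(imdb_entry_point, payload) as imdb:
        if not imdb.ok:
            break
        html = BeautifulSoup(imdb.text, 'html.parser')
        for h3 in html('h3', 'lister-item-header'):
            titles.append(h3('a')[0].get_text())

for i, title in enumerate(titles, 1):
    print(i, title)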
Here is a working solution, using both BeautifulSoup and some nasty regex, but it works fine. I love regex, though it seems I build them in a weird way; I can explain how they work if you want.
import re, urllib.request
from bs4 import BeautifulSoup

url = "https://www.imdb.com/search/title?groups=top_250&sort=user_rating"
response = urllib.request.urlopen(url)
html = response.read()
soup = BeautifulSoup(html, 'html.parser')

i = 0
for txt in soup.findAll(attrs={"class": "lister-item-header"}):
    i += 1
    print(str(i) + " ." + re.match("""^.*>(.*)</a>.*$""", re.sub('"', '', re.sub('\n', '', str(txt)))).group(1))
My output (it's French...):
Les évadés
Le parrain
The Dark Knight: Le chevalier noir
Le parrain, 2ème partie
Le seigneur des anneaux: Le retour du roi
And the list goes on...

Trying to get all imgs from the web with python + bs4

First of all, I'll say that, as my code comments are in Spanish, I'll try to explain them in English, even though the code is pretty obvious and easy to understand. Don't feel insulted if I explain things that are too obvious :)
So I'm trying to get all imgs from a website, but it seems it just doesn't want to work. I've read some similar articles, but none seem to help.
import requests
from bs4 import BeautifulSoup as bs
import os
You can visit the site and see the HTML yourself.
# url de las imgs (the imgs' URL)
url = 'https://dbz.space/cards/'
Here request the web page for it to be parsed
# descargamos la pagina para scrapear (we download the page to scrape)
page = requests.get(url)
soup = bs(page.text, 'html.parser')
Here I search for all the images with that class
# localizamos todas las imgs con esa clase
image_tags = soup.findAll("img", {"class": "thumb on"})
Here I just check if the folder imgs exists; if it doesn't, I create one and then go inside it.
# si no existe imgs lo creamos (if imgs doesn't exist, we create it)
if not os.path.exists('imgs'):
    os.makedirs('imgs')
# cambiamos de directorio (change directory)
os.chdir('imgs')
A variable for naming the images:
# para el nombre de la imagen (for the image name)
x = 0
And finally the saving process
# guardando imagenes (saving images)
for image in image_tags:
    try:
        url = image['src']
        response = requests.get(url)
        if response.status_code == 200:
            with open('img-' + str(x) + '.jpg', 'wb') as f:
                f.write(requests.get(url).content)
                f.close()
            x += 1
            print('Nueva imagen en carpeta')  # new image in the folder
    except:
        pass
So, the imgs on the site are inside a div tag, they have the class "thumb on", and they also contain the src link (obviously), which is the one I want to save into my folder called "imgs".
If all you want is the URL of the image file itself...
<img class="thumb on" src="https://dbz.s3.amazonaws.com/v2/global/character/thumb/card_1011720_thumb.png">
Then simply...
yourBSobj.find("img", {"class": "thumb on"}).attrs['src']
I would actually use find_all() so you can iterate through a list of images, do your processing/saving, and then see your results afterwards.
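A minimal sketch of that idea, for a page where the img tags are present in the static HTML (the class name and base page come from the question; as the next answer explains, on this particular site the images are injected by JavaScript, so there the loop would come up empty):
import requests
from bs4 import BeautifulSoup

page = requests.get('https://dbz.space/cards/')
soup = BeautifulSoup(page.text, 'html.parser')

for i, img in enumerate(soup.find_all("img", {"class": "thumb on"})):
    src = img.attrs.get('src')
    if not src:
        continue
    # download each image and save it under a sequential name
    with open('img-{}.jpg'.format(i), 'wb') as f:
        f.write(requests.get(src).content)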
First of all, as @cricket_007 said, the img tags are indeed loaded asynchronously by JavaScript. But there is no need to use Selenium.
Upon inspection, you can see that each img tag is located inside this tag:
<div class="..." res="..." base="..." aim="" quantity="" release="1" imgur="x">
This tag is available in the source code (i.e., not loaded by JavaScript). Here we can get the x value, which is part of the imgur URL. One example:
<div class="..." res="1010160" base="1010161" aim="" quantity="" release="1" imgur="yK0wNs3">
After getting the imgur value, you can make the URL like this:
'https://i.imgur.com/{}.png'.format(imgur)
As the URL is https://i.imgur.com/yK0wNs3.png.
Complete code:
import os
import requests
from bs4 import BeautifulSoup as bs

r = requests.get('https://dbz.space/cards/')
# soup = BeautifulSoup(r.text, 'lxml')
soup = bs(r.text, 'html.parser')

if not os.path.exists('imgs'):
    os.makedirs('imgs')
os.chdir('imgs')

i = 0
for item in soup.find_all('div', imgur=True):
    imgur = item['imgur']
    if imgur:
        r = requests.get('https://i.imgur.com/{}.png'.format(imgur))
        with open('img-{}.jpg'.format(i), 'wb') as f:
            f.write(r.content)
        i += 1
Note: I'm using f.write(r.content) and not f.write(requests.get(url).content). There's no need to send one more request.
So the error that popped up, saying:
File "pilla.py", line 6, in <module> soup = BeautifulSoup(r.text, 'lxml') NameError: name 'BeautifulSoup' is not defined
is solved by changing, in the soup variable, BeautifulSoup to bs and 'lxml' to 'html.parser'. The complete code is right here:
import requests
from bs4 import BeautifulSoup as bs
import os

r = requests.get('https://dbz.space/cards/')
soup = bs(r.text, 'html.parser')

if not os.path.exists('imgs'):
    os.makedirs('imgs')
os.chdir('imgs')

i = 0
for item in soup.find_all('div', imgur=True):
    imgur = item['imgur']
    if imgur:
        r = requests.get('https://i.imgur.com/{}.png'.format(imgur))
        with open('img-{}.jpg'.format(i), 'wb') as f:
            f.write(r.content)
        i += 1
Thank you all very much for the help. Really appreciate it :)

Scraping links from buttons on a page

I am trying to scrape the links from the "box score" buttons on this page. Each button should link to a URL like this:
http://www.espn.com/nfl/boxscore?gameId=400874795
I tried to use this code to see if I could access the buttons, but I cannot:
from bs4 import BeautifulSoup
import requests

url = 'http://www.espn.com/nfl/scoreboard/_/year/2016/seasontype/1/week/2'
advanced = url
r = requests.get(advanced)
data = r.text
soup = BeautifulSoup(data, "html.parser")
for link in soup.find_all('a'):
    print link
As wpercy mentions in his comment, you can't do this using just requests; as a suggestion, you should use Selenium together with ChromeDriver/PhantomJS to handle the JavaScript:
from selenium import webdriver
from bs4 import BeautifulSoup
url = "http://www.espn.com/nfl/scoreboard/_/year/2016/seasontype/1/week/2"
browser = webdriver.Chrome()
browser.get(url)
html = browser.page_source
soup = BeautifulSoup(html,'html.parser')
boxList = soup.findAll('a',{'name':'&lpos=nfl:scoreboard:boxscore'})
All the score buttons' a tags have the attribute name = &lpos=nfl:scoreboard:boxscore, so we first use .findAll, and then a simple list comprehension can extract each href attribute:
>>> links = [box['href'] for box in boxList]
>>> links
['/nfl/boxscore?gameId=400874795', '/nfl/boxscore?gameId=400874854', '/nfl/boxscore?gameId=400874753', '/nfl/boxscore?gameId=400874757', '/nfl/boxscore?gameId=400874772', '/nfl/boxscore?gameId=400874777', '/nfl/boxscore?gameId=400874767', '/nfl/boxscore?gameId=400874812', '/nfl/boxscore?gameId=400874761', '/nfl/boxscore?gameId=400874764', '/nfl/boxscore?gameId=400874781', '/nfl/boxscore?gameId=400874796', '/nfl/boxscore?gameId=400874750', '/nfl/boxscore?gameId=400873867', '/nfl/boxscore?gameId=400874775', '/nfl/boxscore?gameId=400874798']
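Since those href values are relative, here is a short follow-up sketch that turns the links list from above into absolute URLs with urljoin from the standard library:
from urllib.parse import urljoin

base = 'http://www.espn.com'
full_links = [urljoin(base, link) for link in links]
# e.g. 'http://www.espn.com/nfl/boxscore?gameId=400874795'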
Here is the solution I did; it scrapes all the links that are on the URL you provided in your question. You can check it out:
# from BeautifulSoup import *
from bs4 import BeautifulSoup
# import requests
import urllib

url = 'http://www.espn.com/nfl/scoreboard/_/year/2016/seasontype/1/week/2'
# advanced = url
html = urllib.urlopen(url).read()
# r = requests.get(html)
# data = r.text
soup = BeautifulSoup(html)
tags = soup('a')
# for link in soup.find_all('a'):
for i, tag in enumerate(tags):
    # print tag
    print i
    ans = tag.get('href', None)
    print ans
    print "\n"
The answer from Gopal Chitalia didn't work for me, so I decided to post a working one (for Python 3.6.5):
# from BeautifulSoup import *
from bs4 import BeautifulSoup
# import requests
import urllib.request

url = 'http://www.espn.com/nfl/scoreboard/_/year/2016/seasontype/1/week/2'
# advanced = url
html = urllib.request.urlopen(url)
# urlopen(url).read()
# r = requests.get(html)
# data = r.text
soup = BeautifulSoup(html)
tags = soup('a')
# for link in soup.find_all('a'):
for i, tag in enumerate(tags):
    # print tag
    print(i)
    ans = tag.get('href', None)
    print(ans)
    print("\n")
