I want to extract the IPA keys under the French section of this Wiktionary page:
https://en.wiktionary.org/wiki/son#French
I only want the data in the French section.
from bs4 import BeautifulSoup
import requests
import pandas as pd

def main():
    test_url_page = 'https://en.wiktionary.org/wiki/son#French'
    req = requests.get(test_url_page)
    content = req.text
    ipa_data = []
    soup = BeautifulSoup(content, 'html.parser')
    # note: `{'class': ...} and {'id': ...}` evaluates to the second dict only,
    # so both attributes are merged into one dict here
    french_section = soup.find('span', {'class': 'mw-headline', 'id': 'French'})
    for fr_ipas in french_section.find_next('span', {'class': 'IPA'}):
        ipa_data.append(fr_ipas)
    fr_ipas_all = french_section.find_all_next('span', {'class': 'IPA'})
find_next only returns the first element under the French section. find_all and find_all_next return a list of all matching elements in the whole page. I just want the elements under the French section, and there are multiple IPA keys there.
Close to your goal, but you have to iterate over the section's sibling tags with .find_next_siblings(), check each one for your IPA element, and break out of the loop once you hit an <hr>, which marks the start of the next section:
french_section = soup.find('span', {'id': 'French'}).parent
for tag in french_section.find_next_siblings():
    if tag.name == 'hr':  # the next language section starts here
        break
    if tag.find('span', {'class': 'IPA'}):
        ipa_data.append(tag.find('span', {'class': 'IPA'}))
Example
from bs4 import BeautifulSoup
import requests

def main():
    test_url_page = 'https://en.wiktionary.org/wiki/son#French'
    req = requests.get(test_url_page)
    content = req.text
    ipa_data = []
    soup = BeautifulSoup(content, 'html.parser')
    french_section = soup.find('span', {'id': 'French'}).parent
    for tag in french_section.find_next_siblings():
        if tag.name == 'hr':  # stop at the next language section
            break
        if tag.find('span', {'class': 'IPA'}):
            ipa_data.append(tag.find('span', {'class': 'IPA'}))
    return ipa_data

print(main())
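If you want the transcription strings rather than the tag objects, a small variant of the same stop-at-<hr> loop works (a sketch under the same assumptions about the page structure as above):

from bs4 import BeautifulSoup
import requests

def french_ipa(url='https://en.wiktionary.org/wiki/son#French'):
    soup = BeautifulSoup(requests.get(url).text, 'html.parser')
    section = soup.find('span', {'id': 'French'}).parent
    ipa = []
    for tag in section.find_next_siblings():
        if tag.name == 'hr':  # an <hr> marks the next language section
            break
        # a block can contain several IPA spans; collect their text
        ipa.extend(span.text for span in tag.find_all('span', {'class': 'IPA'}))
    return ipa

print(french_ipa())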
I'm trying to grab all prices from a website using XPath. All prices share the same XPath, and only [0] (the first item, I assume) works. Let me show you:
webpage = requests.get(URL, headers=HEADERS)
soup = BeautifulSoup(webpage.content, "html.parser")
dom = etree.HTML(str(soup))
print(dom.xpath('/html/body/div[1]/div[5]/div/div/div/div[1]/ul/li[1]/article/div[1]/div[2]/div')[0].text)
This successfully prints the first price!
I tried changing "[0].text" to [1] to print the second item, but it returned "out of range".
Then I tried to come up with a for loop that would print all items, so I could compute an average.
Any help would be greatly appreciated!
Apologies, I've edited in the full code:
from bs4 import BeautifulSoup
from lxml import etree
import requests
URL = "https://www.newegg.com/p/pl?d=GPU&N=601357247%20100007709"
#HEADERS = you'll need to add your own headers here, won't let post.
webpage = requests.get(URL, headers=HEADERS)
soup = BeautifulSoup(webpage.content, "html.parser")
dom = etree.HTML(str(soup))
print(dom.xpath('/html/body/div[10]/div[4]/section/div/div/div[2]/div/div/div/div[2]/div/div[2]/div[2]/div[1]/div/div[2]/ul/li[3]/strong')[0].text)
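As an aside, that absolute XPath pins you to one fixed position in the tree, so the result list holds a single node and bumping the index goes out of range. A relative, class-based XPath matches every price node, and you can loop over the result. A rough sketch, assuming the price elements carry the price-current class that the answer below also relies on:

# hypothetical relative XPath: matches every node whose class contains
# "price-current", rather than one absolute position in the tree
for node in dom.xpath('//li[contains(@class, "price-current")]'):
    print(''.join(node.itertext()).strip())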
You could just use CSS selectors, which in this instance are a lot more readable. I would also remove some of the offer info to leave just the actual price.
import requests
from bs4 import BeautifulSoup as bs
from pprint import pprint

r = requests.get("https://www.newegg.com/p/pl?d=GPU&N=601357247%20100007709", headers={'User-Agent': 'Mozilla/5.0'})
soup = bs(r.text, features="lxml")
prices = {}

for i in soup.select('.item-container'):
    # drop the nested offer info so only the actual price remains
    if a := i.select_one('.price-current-num'):
        a.decompose()
    # [:-1] trims the trailing character left after the price figure
    prices[i.select_one('.item-title').text] = i.select_one('.price-current').get_text(strip=True)[:-1]

pprint(prices)
Prices as a list of floats:
import re
import requests
from bs4 import BeautifulSoup as bs
from pprint import pprint

r = requests.get("https://www.newegg.com/p/pl?d=GPU&N=601357247%20100007709", headers={'User-Agent': 'Mozilla/5.0'})
soup = bs(r.text, features="lxml")
prices = []

for i in soup.select('.item-container'):
    if a := i.select_one('.price-current-num'):
        a.decompose()
    # strip the "$" and thousands separators, then convert to float
    prices.append(float(re.sub(r'[$,]', '', i.select_one('.price-current').get_text(strip=True)[:-1])))

pprint(prices)
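Since the original goal was an average, one more line on top of that list of floats (guarding against an empty result) will do it:

# mean of the scraped prices; avoid ZeroDivisionError on an empty list
average = sum(prices) / len(prices) if prices else 0.0
print(f'{len(prices)} prices, average ${average:,.2f}')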
I have scraped a website that provides Lisbon zip codes. With BeautifulSoup I was able to get the zip codes within a class item. However, the zip codes themselves are still nested inside other tags, and apart from string manipulation I haven't managed to extract them. I am new to web scraping and HTML, so sorry if this question is very basic.
This is my code:
from bs4 import BeautifulSoup as soup
from requests import get
url='https://worldpostalcode.com/portugal/lisboa/'
response = get(url)
print(response.text)
html_soup = soup(response.text,'lxml')
type(html_soup)
zip_codes=html_soup.find_all('div', {'class' : 'rightc'})
And this is a snippet of the result, from which I would like to extract only the zip codes:
[<div class="rightc">1000-246<hr/> 1050-138<hr/> 1069-188<hr/> 1070-204<hr/> 1100-069<hr/> 1100-329<hr/> 1100-591<hr/> 1150-144<hr/> 1169-062<hr/> 1170-128<hr/> 1170-395<hr/> 1200-228<hr/> 1200-604<hr/> 1200-862<hr/> 1250-111<hr/> 1269-121<hr/> 1300-217<hr/> 1300-492<hr/> 1350-092<hr/> 1399-014<hr/> 1400-237<hr/> 1500-061<hr/> 1500-360<hr/> 1500-674<hr/> 1600-232<hr/> 1600-643<hr/> 1700-018<hr/> 1700-302<hr/> 1750-113<hr/> 1750-464<hr/> 1800-262<hr/> 1900-115<hr/> 1900-401<hr/> 1950-208<hr/> 1990-162<hr/> 1000-247<hr/> 1050-139<hr/> 1069-190<hr/> 1070-205<hr/> 1100-070<hr/> 1100-330</div>]
Your result zip_codes has the type bs4.element.ResultSet, which is a set of bs4.element.Tag, so zip_codes[0] is what you're interested in (the first tag found). Use its .text attribute to drop the <hr> tags. Now you have one long string of zip codes separated by spaces; split it into a list (two options below; option one is more pythonic and faster).
from bs4 import BeautifulSoup as soup
from requests import get
url = 'https://worldpostalcode.com/portugal/lisboa/'
response = get(url)
html_soup = soup(response.text,'lxml')
zip_codes = html_soup.find_all('div', {'class' : 'rightc'})
# option one
zips = zip_codes[0].text.split(' ')
print(zips[:8])
# option two (slower)
zips = []
for zc in zip_codes[0].childGenerator():
    # extracting each node as we go mutates the tree, so the iterator
    # ends up skipping the <hr/> tags between the codes
    zips.append(zc.extract().strip())
print(zips[:8])
Output:
['1000-246', '1050-138', '1069-188', '1070-204', '1100-069', '1100-329', '1100-591', '1150-144']
['1000-246', '1050-138', '1069-188', '1070-204', '1100-069', '1100-329', '1100-591', '1150-144']
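A third option, for what it's worth: bs4's stripped_strings generator yields exactly the text nodes between the <hr/> tags, already trimmed of whitespace:

# option three: stripped_strings skips the <hr/> tags and trims each code
zips = list(zip_codes[0].stripped_strings)
print(zips[:8])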
from bs4 import BeautifulSoup

# htmlcontent is the raw HTML of the page, fetched as in the question
html_soup = BeautifulSoup(htmlcontent, 'lxml')
zip_codes = html_soup.find_all('div', {'class': 'rightc'})
print(zip_codes[0].text.split(' '))
You can get the text and split it.
Output:
[u'1000-246', u'1050-138', u'1069-188', u'1070-204', .........]
Use regex to grab the codes
from bs4 import BeautifulSoup
import requests
import re
url = 'https://worldpostalcode.com/portugal/lisboa/'
res = requests.get(url)
soup = BeautifulSoup(res.content, "lxml")
element = soup.select_one('.codelist .rightc')
codes = re.findall(r"\d{4}-\d{3}",element.text)
for code in codes:
    print(code)
I would suggest replacing all the <hr/> tags with some delimiter (e.g. #, $, or ,) before loading the response into soup. The job then becomes easy: once it's loaded, you can extract the zip codes as a list just by selecting the class and splitting on the delimiter.
from bs4 import BeautifulSoup as soup
from requests import get

url = 'https://worldpostalcode.com/portugal/lisboa/'
response = get(url)
# swap the <hr> separators for '#' before parsing (the source may
# use <hr> or <hr/>, so replace both spellings)
text = response.text.replace('<hr>', '#').replace('<hr/>', '#')
html_soup = soup(text, 'lxml')
zip_codes = html_soup.find_all('div', {'class': 'rightc'})
zips = zip_codes[0].text.split('#')
print(zips)
Hope this helps! Cheers!
P.S.: Answer is open for improvements and comments.
I'm trying to scrape a multi-page website with Beautiful Soup. The code works only partially: it returns just the last page instead of all pages. How can I fix the problem?
# import libraries
import urllib.request
from bs4 import BeautifulSoup
# specify the url
aziende = [
'35-azienda-4_planets', '36-azienda-archivio_23', '24-azienda-bm', '16-azienda-brolese_virginio', '39-azienda-castellani', '19-azienda-centro_ottico_bisa', '25-azienda-comel_optik', '37-azienda-de_lorenzo_occhiali', '15-azienda-delta_laser', '34-azienda-dem', '21-azienda-erizzo', '3-azienda-evo', '27-azienda-farben_occhialeria', '32-azienda-gio__eyewear', '7-azienda-gipizeta', '42-azienda-h8', '20-azienda-idea_91', '5-azienda-lem', '41-azienda-lasertec', '22-azienda-le_thi_thu_thu', '28-azienda-m1', '1-azienda-mati_', '38-azienda-metal_dream', '30-azienda-mictu', '23-azienda-nete', '10-azienda-new_italian_design_eyewear', '31-azienda-okki_lux', '9-azienda-ottica_pra_floriani', '12-azienda-pao', '40-azienda-palladio_occhiali', '29-azienda-plastil_due', '17-azienda-punti_di_vista', '14-azienda-quemme', '4-azienda-red_line', '43-azienda-revert', '33-azienda-sm', '6-azienda-scussel', '8-azienda-sistem', '18-azienda-stile_italiano', '26-azienda-tecnodanta', '11-azienda-toffoli_costantino', '13-azienda-tri_color', '2-azienda-zago'
]
for azienda in aziende:
    page_link = 'http://www.occhialeriabellunotreviso.it/' + azienda
    page = urllib.request.urlopen(page_link)  # query the website and return the html in `page`
    soup = BeautifulSoup(page, 'html.parser')  # parse the html with beautiful soup and store it in `soup`

# take out the <h2> with the name and get its value
# (note: these lines sit outside the for-loop)
name_box = soup.find('h2')
name = name_box.text.strip()  # strip() removes leading and trailing whitespace
print(name)
Just put the final lines of code that are outside the for-loop inside the for-loop, so they run for every page.
# import libraries
import urllib.request
from bs4 import BeautifulSoup
# specify the url
aziende = [
'35-azienda-4_planets', '36-azienda-archivio_23', '24-azienda-bm', '16-azienda-brolese_virginio', '39-azienda-castellani', '19-azienda-centro_ottico_bisa', '25-azienda-comel_optik', '37-azienda-de_lorenzo_occhiali', '15-azienda-delta_laser', '34-azienda-dem', '21-azienda-erizzo', '3-azienda-evo', '27-azienda-farben_occhialeria', '32-azienda-gio__eyewear', '7-azienda-gipizeta', '42-azienda-h8', '20-azienda-idea_91', '5-azienda-lem', '41-azienda-lasertec', '22-azienda-le_thi_thu_thu', '28-azienda-m1', '1-azienda-mati_', '38-azienda-metal_dream', '30-azienda-mictu', '23-azienda-nete', '10-azienda-new_italian_design_eyewear', '31-azienda-okki_lux', '9-azienda-ottica_pra_floriani', '12-azienda-pao', '40-azienda-palladio_occhiali', '29-azienda-plastil_due', '17-azienda-punti_di_vista', '14-azienda-quemme', '4-azienda-red_line', '43-azienda-revert', '33-azienda-sm', '6-azienda-scussel', '8-azienda-sistem', '18-azienda-stile_italiano', '26-azienda-tecnodanta', '11-azienda-toffoli_costantino', '13-azienda-tri_color', '2-azienda-zago'
]
for azienda in aziende:
    page_link = 'http://www.occhialeriabellunotreviso.it/' + azienda
    page = urllib.request.urlopen(page_link)  # query the website and return the html in `page`
    soup = BeautifulSoup(page, 'html.parser')  # parse the html with beautiful soup and store it in `soup`

    # take out the <h2> with the name and get its value, once per page
    name_box = soup.find('h2')
    name = name_box.text.strip()  # strip() removes leading and trailing whitespace
    print(name)
There is nothing wrong with the way you already tried, apart from the indentation. An alternative approach could look like this:
import urllib.request
from bs4 import BeautifulSoup
link = 'http://www.occhialeriabellunotreviso.it/{}'
aziende = (
'35-azienda-4_planets', '36-azienda-archivio_23', '24-azienda-bm', '16-azienda-brolese_virginio', '39-azienda-castellani', '19-azienda-centro_ottico_bisa', '25-azienda-comel_optik', '37-azienda-de_lorenzo_occhiali', '15-azienda-delta_laser', '34-azienda-dem', '21-azienda-erizzo', '3-azienda-evo', '27-azienda-farben_occhialeria', '32-azienda-gio__eyewear', '7-azienda-gipizeta', '42-azienda-h8', '20-azienda-idea_91', '5-azienda-lem', '41-azienda-lasertec', '22-azienda-le_thi_thu_thu', '28-azienda-m1', '1-azienda-mati_', '38-azienda-metal_dream', '30-azienda-mictu', '23-azienda-nete', '10-azienda-new_italian_design_eyewear', '31-azienda-okki_lux', '9-azienda-ottica_pra_floriani', '12-azienda-pao', '40-azienda-palladio_occhiali', '29-azienda-plastil_due', '17-azienda-punti_di_vista', '14-azienda-quemme', '4-azienda-red_line', '43-azienda-revert', '33-azienda-sm', '6-azienda-scussel', '8-azienda-sistem', '18-azienda-stile_italiano', '26-azienda-tecnodanta', '11-azienda-toffoli_costantino', '13-azienda-tri_color', '2-azienda-zago'
)
def get_item(url):
    for azienda in aziende:
        page = urllib.request.urlopen(url.format(azienda))
        soup = BeautifulSoup(page, 'html.parser')
        name_box = soup.find('h2').get_text(strip=True)
        yield name_box

if __name__ == '__main__':
    for item in get_item(link):
        print(item)
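For what it's worth, if the list of pages grows, reusing one HTTP connection speeds things up. A minimal sketch of the same loop with requests.Session (requests is assumed here in place of urllib, as in the other questions on this page; aziende is the tuple defined above):

import requests
from bs4 import BeautifulSoup

def get_items(base='http://www.occhialeriabellunotreviso.it/{}'):
    # one Session reuses the underlying TCP connection across requests
    with requests.Session() as session:
        for azienda in aziende:
            page = session.get(base.format(azienda))
            yield BeautifulSoup(page.text, 'html.parser').find('h2').get_text(strip=True)

for item in get_items():
    print(item)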
I was building a web scraper using Python.
The purpose of my scraper is to fetch all the links to websites from this webpage: http://www.ebizmba.com/articles/torrent-websites
I want output like -
www.thepiratebay.se
www.kat.ph
I am new to Python and scraping, and I am doing this just for practice. Please help me get the right output.
My code:
import requests
from bs4 import BeautifulSoup

r = requests.get("http://www.ebizmba.com/articles/torrent-websites")
soup = BeautifulSoup(r.content, "html.parser")
data = soup.find_all("div", {"class": "main-container-2"})

for item in data:
    print(item.contents[1].find_all("a"))
My output: http://i.stack.imgur.com/Xi37B.png
If you are web scraping for practice, have a look at regular expressions. This would get just the headline links. The needle string is the match pattern; the parentheses (http:.*?) delimit the capture group.
# Python 2 (urllib2 was replaced by urllib.request in Python 3)
import urllib2
import re

myURL = "http://www.ebizmba.com/articles/torrent-websites"
req = urllib2.Request(myURL)
Needle1 = '<p><a href="(http:.*?)" rel="nofollow" target="_blank">'

for match in re.finditer(Needle1, urllib2.urlopen(req).read()):
    print(match.group(1))
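That snippet is Python 2. A rough Python 3 equivalent (the response bytes have to be decoded before matching) might look like:

import re
import urllib.request

myURL = "http://www.ebizmba.com/articles/torrent-websites"
html = urllib.request.urlopen(myURL).read().decode('utf-8', errors='replace')
Needle1 = '<p><a href="(http:.*?)" rel="nofollow" target="_blank">'

for match in re.finditer(Needle1, html):
    print(match.group(1))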
Use .get('href') like this:
import requests
from bs4 import BeautifulSoup

r = requests.get("http://www.ebizmba.com/articles/torrent-websites")
soup = BeautifulSoup(r.text, "html.parser")
data = soup.find_all("div", {"class": "main-container-2"})

for i in data:
    for j in i.contents[1].find_all("a"):
        print(j.get('href'))
Full output:
http://www.thepiratebay.se
http://siteanalytics.compete.com/thepiratebay.se
http://quantcast.com/thepiratebay.se
http://www.alexa.com/siteinfo/thepiratebay.se/
http://www.kickass.to
http://siteanalytics.compete.com/kickass.to
http://quantcast.com/kickass.to
http://www.alexa.com/siteinfo/kickass.to/
http://www.torrentz.eu
http://siteanalytics.compete.com/torrentz.eu
http://quantcast.com/torrentz.eu
http://www.alexa.com/siteinfo/torrentz.eu/
http://www.extratorrent.cc
http://siteanalytics.compete.com/extratorrent.cc
http://quantcast.com/extratorrent.cc
http://www.alexa.com/siteinfo/extratorrent.cc/
http://www.yify-torrents.com
http://siteanalytics.compete.com/yify-torrents.com
http://quantcast.com/yify-torrents.com
http://www.alexa.com/siteinfo/yify-torrents.com
http://www.bitsnoop.com
http://siteanalytics.compete.com/bitsnoop.com
http://quantcast.com/bitsnoop.com
http://www.alexa.com/siteinfo/bitsnoop.com/
http://www.isohunt.to
http://siteanalytics.compete.com/isohunt.to
http://quantcast.com/isohunt.to
http://www.alexa.com/siteinfo/isohunt.to/
http://www.sumotorrent.sx
http://siteanalytics.compete.com/sumotorrent.sx
http://quantcast.com/sumotorrent.sx
http://www.alexa.com/siteinfo/sumotorrent.sx/
http://www.torrentdownloads.me
http://siteanalytics.compete.com/torrentdownloads.me
http://quantcast.com/torrentdownloads.me
http://www.alexa.com/siteinfo/torrentdownloads.me/
http://www.eztv.it
http://siteanalytics.compete.com/eztv.it
http://quantcast.com/eztv.it
http://www.alexa.com/siteinfo/eztv.it/
http://www.rarbg.com
http://siteanalytics.compete.com/rarbg.com
http://quantcast.com/rarbg.com
http://www.alexa.com/siteinfo/rarbg.com/
http://www.1337x.org
http://siteanalytics.compete.com/1337x.org
http://quantcast.com/1337x.org
http://www.alexa.com/siteinfo/1337x.org/
http://www.torrenthound.com
http://siteanalytics.compete.com/torrenthound.com
http://quantcast.com/torrenthound.com
http://www.alexa.com/siteinfo/torrenthound.com/
https://demonoid.org/
http://siteanalytics.compete.com/demonoid.pw
http://quantcast.com/demonoid.pw
http://www.alexa.com/siteinfo/demonoid.pw/
http://www.fenopy.se
http://siteanalytics.compete.com/fenopy.se
http://quantcast.com/fenopy.se
http://www.alexa.com/siteinfo/fenopy.se/
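Since the goal was just the site links themselves, here is a short follow-up on the loop above that drops the compete/quantcast/alexa analytics links visible in this output (the host list is taken from the output, not from the page markup):

from urllib.parse import urlparse

# analytics hosts seen in the full output above; skip their links
ANALYTICS = {'siteanalytics.compete.com', 'quantcast.com', 'www.alexa.com'}

for i in data:
    for j in i.contents[1].find_all('a'):
        href = j.get('href')
        if href and urlparse(href).netloc not in ANALYTICS:
            print(href)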