Scraping data using BeautifulSoup - python

I'm trying to scrape the data from this site into a dictionary,
# Collect the Tag objects for each div of class "info1" .. "info6".
# NOTE: find_all returns Tag objects, so printing `info` shows raw HTML;
# use .text on each tag to strip the markup (as the answers below do).
from bs4 import BeautifulSoup
import requests
from pprint import pprint

page = requests.get('https://webscraper.io/')
soup = BeautifulSoup(page.text, "lxml")

info = []
for x in range(1, 7):
    # find_all (aliased findAll) matches every div carrying class "info{x}"
    items = soup.findAll("div", {"class": f"info{x}"})
    info.append(items)
however, the HTML tags are not being removed.

You need to use .text. Then, to get it into the format you want, you would need to do a bit of string manipulation.
# Grab the first div of each class "info1" .. "info6", strip the markup with
# .text, and join the cleaned strings into one printable report.
from bs4 import BeautifulSoup
import requests
from pprint import pprint

url = 'https://webscraper.io/'
page = requests.get(url)
soup = BeautifulSoup(page.text, "lxml")

info = []
for x in range(1, 7):
    # .text drops the HTML tags; replace inner newlines with ": " so each
    # heading/body pair reads on one line. (find() returns None if the class
    # is missing, which would raise AttributeError here.)
    item = soup.find("div", {"class": "info%s" % x}).text.strip().replace('\n', ': ')
    info.append(item)

info = '\n'.join(info)
print(info)

Something like this might work? (Replace the webscraper.io url with your actual request URL; Also, you'd still need to clean up the \n characters from the output):
# Same idea as the question's code, but flatten the tag lists into a single
# list of plain-text strings via .text on every matched tag.
from bs4 import BeautifulSoup
import requests
from pprint import pprint

page = requests.get('https://webscraper.io/')
soup = BeautifulSoup(page.text, "lxml")

info = []
for x in range(1, 7):
    items = soup.findAll("div", {"class": f"info{x}"})
    # item.text strips the HTML markup; += extends info with the new strings
    info += [item.text for item in items]
I.e. item.text, and concatenate the resulting array with info

Related

lxml to grab All items that share a certain xpath

I'm trying to grab all prices from a website, using the xpath. all prices have the same xpath, and only [0], or I assume the 1st item works... let me show you:
# Fetch the page (URL and HEADERS are defined elsewhere), hand the
# BeautifulSoup-normalized HTML to lxml, and print the first price node
# matched by the absolute XPath.
webpage = requests.get(URL, headers=HEADERS)
soup = BeautifulSoup(webpage.content, "html.parser")
dom = etree.HTML(str(soup))
price_nodes = dom.xpath('/html/body/div[1]/div[5]/div/div/div/div[1]/ul/li[1]/article/div[1]/div[2]/div')
print(price_nodes[0].text)
This successfully prints the 1st price!!!
I tried changing "[0].text" to 1, to print the 2nd item but it returned "out of range".
Then I was trying to think of some For loop that would print All Items, so I could create an average.
Any help would be Greatly appreciated!!!
I apologize — I have edited the question to include the code.
# Question's code: fetch a Newegg GPU listing and pull one node by absolute XPath.
from bs4 import BeautifulSoup
from lxml import etree
import requests

URL = "https://www.newegg.com/p/pl?d=GPU&N=601357247%20100007709"
#HEADERS = you'll need to add your own headers here, won't let post.
# NOTE(review): HEADERS is commented out above, so the next line raises
# NameError as posted — supply your own headers dict before running.
webpage = requests.get(URL, headers=HEADERS)
soup = BeautifulSoup(webpage.content, "html.parser")
# Round-trip through BeautifulSoup, then parse with lxml to get XPath support.
dom = etree.HTML(str(soup))
# Absolute XPath to a single node; [0] takes the first (and only) match,
# which is why indexing [1] was "out of range" for the asker.
print(dom.xpath('/html/body/div[10]/div[4]/section/div/div/div[2]/div/div/div/div[2]/div/div[2]/div[2]/div[1]/div/div[2]/ul/li[3]/strong')[0].text)
You could just use css selectors which, in this instance, are a lot more readable. I would also remove some of the offers info to leave just the actual price.
# Build a {product title: price string} dict using CSS selectors instead of
# brittle absolute XPaths. The ".price-current-num" offer counter is removed
# so only the actual price remains.
import requests
from bs4 import BeautifulSoup as bs
from pprint import pprint

r = requests.get("https://www.newegg.com/p/pl?d=GPU&N=601357247%20100007709",
                 headers={'User-Agent': 'Mozilla/5.0'})
soup = bs(r.text, features="lxml")

prices = {}
for i in soup.select('.item-container'):
    # Drop the "(n offers)" node, if present, so it doesn't pollute the price text
    if a := i.select_one('.price-current-num'):
        a.decompose()
    # [:-1] trims the trailing dash/space Newegg appends after the cents
    prices[i.select_one('.item-title').text] = \
        i.select_one('.price-current').get_text(strip=True)[:-1]

pprint(prices)
prices as list of floats
# Variant: collect the prices as a list of floats, stripping "$" and thousands
# separators with a regex before converting.
import requests, re
from bs4 import BeautifulSoup as bs
from pprint import pprint

r = requests.get("https://www.newegg.com/p/pl?d=GPU&N=601357247%20100007709",
                 headers={'User-Agent': 'Mozilla/5.0'})
soup = bs(r.text, features="lxml")

prices = []
for i in soup.select('.item-container'):
    # Drop the "(n offers)" node so only the price text is left
    if a := i.select_one('.price-current-num'):
        a.decompose()
    # Raw string for the regex: '\$' in a normal string is an invalid escape
    prices.append(float(re.sub(r'\$|,', '',
                               i.select_one('.price-current').get_text(strip=True)[:-1])))

pprint(prices)

Scraping multiple pages with Python and BeautifulSoup

I'm trying to scrape many pages in Python using BeautifulSoup but with no positive results.
I tried using requests.get() and session.get(). The number of pages I should scrape is 92.
# Scrape 92 paginated listing pages with one persistent HTTP session.
import requests
from bs4 import BeautifulSoup
import urllib.request

# BUG in the original: `with requests.Session as session` used the class
# itself — it must be instantiated: requests.Session()
with requests.Session() as session:
    count = 0
    for i in range(92):
        count += 1
        page = "https://www.paginegialle.it/lazio/roma/dentisti/p-" + str(count) + ".html"
        r = session.get(page)
        # Explicit parser avoids bs4's "no parser specified" warning;
        # note `soup` is rebound each iteration, so only the last page
        # survives the loop — process it inside the loop instead.
        soup = BeautifulSoup(r.content, "html.parser")
Using print(page), the page URLs are formatted correctly. But when I print the values stored in soup, only the values of the first page are printed.
I'm using a jupyter notebook
you can do as below:
# Fetch each listing page with a plain requests.get and show how many <p>
# elements it contains — processing must happen inside the loop, since soup
# is overwritten on every iteration.
import requests
from bs4 import BeautifulSoup
import urllib.request

for i in range(92):
    url = "https://www.paginegialle.it/lazio/roma/dentisti/p-" + str(i) + ".html"
    r = requests.get(url)
    soup = BeautifulSoup(r.content, 'html.parser')
    p = soup.select('p')
    print(len(p))
This will work.
# Fetch pages p-1.html .. p-92.html and print each page's <body> as it is
# scraped (printing inside the loop, so every page is shown — not just the last).
from bs4 import BeautifulSoup
import requests

count = 0
for i in range(92):
    count += 1  # pages are 1-based, range() is 0-based
    source1 = requests.get(
        "https://www.paginegialle.it/lazio/roma/dentisti/p-" + str(count) + ".html"
    ).text
    soup1 = BeautifulSoup(source1, 'lxml')
    print(soup1.body)
    print()

print("done")
Another solution.
# Same pagination loop using the simplified_scrapy helpers: fetch each page
# and print its <title> text.
from simplified_scrapy.request import req
from simplified_scrapy.simplified_doc import SimplifiedDoc

count = 0
for i in range(92):
    count += 1
    # BUG in the original: it fetched page str(i) (0-based, so p-0.html)
    # while printing the 1-based `count`; use `count` for both.
    html = req.get('https://www.paginegialle.it/lazio/roma/dentisti/p-' + str(count) + '.html')
    doc = SimplifiedDoc(html)
    print(doc.select('title>text()'))
    print(count)

BS4 + html, b Tag issue

This question is about web scraping with bs4
this is the code I have written:
# Question's code: fetch an Alibaba product page and select the review-score
# anchor. select_one returns the whole <a> Tag, which is why the printed
# output still contains HTML markup.
import requests
from bs4 import BeautifulSoup
import json
import csv

product_url = ('https://www.alibaba.com/product-detail/'
               'Portable-Small-USB-Travel-LED-Makeup_60830030133.html'
               '?spm=a2700.details.maylikever.2.1fb53cc2uSVPvx')
page = requests.get(product_url)

# Create a BeautifulSoup object
soup = BeautifulSoup(page.text, 'html.parser')

# extract product score **(This is what I want to extract)**
stars = soup.select_one('a[class="score-lite"]', namespaces=None, flags=0)
#score = json.loads(stars)
print('Stars', stars)
My outcome:
<a class="score-lite" data-spm-click="gostr=/details.index.reviewLevel;locaid=dreviewLevel" href="https://onuliss.en.alibaba.com/company_profile/feedback.html" target="_blank"><b>4.8 </b><img src="//img.alicdn.com/tfs/TB1MJPmiQL0gK0jSZFtXXXQCXXa-8-9.svg"/></a>
The outcome I want is just the 4.8 number between the 'b' tags
What do I have to do with the = soup.select_one() function?
Thank you very much :)
Try with a more specific selector, the string property of the match and strip() to get rid of eventual extra spaces.
# Answer: narrow the selector to the <b> child of the score anchor, then take
# its text with get_text(strip=True) so only the bare number remains.
import requests
from bs4 import BeautifulSoup
import json
import csv

product_url = ('https://www.alibaba.com/product-detail/'
               'Portable-Small-USB-Travel-LED-Makeup_60830030133.html'
               '?spm=a2700.details.maylikever.2.1fb53cc2uSVPvx')
page = requests.get(product_url)

# Create a BeautifulSoup object
soup = BeautifulSoup(page.text, 'html.parser')

# extract product score **(This is what I want to extract)**
# "a[class=...] > b" targets the rating number inside the anchor;
# get_text(strip=True) removes the surrounding whitespace.
stars = soup.select_one('a[class="score-lite"] > b',
                        namespaces=None, flags=0).get_text(strip=True)
#score = json.loads(stars)
print('Stars', stars)
Stars 4.8
how about SimplifiedDoc
# Alternative using SimplifiedDoc: locate the score anchor by tag + class,
# then read the text of the element (and of its <b> child).
import requests
from simplified_scrapy.simplified_doc import SimplifiedDoc

page = requests.get('https://www.alibaba.com/product-detail/Portable-Small-USB-Travel-LED-Makeup_60830030133.html?spm=a2700.details.maylikever.2.1fb53cc2uSVPvx')

# Create a SimplifiedDoc object
doc = SimplifiedDoc(page.text)

# get element use tag and class
stars = doc.getElement('a', 'class', "score-lite")
print('Stars', stars.text, stars.b.text)  # Stars 4.8 4.8
# Answer: find the score anchor, descend to its <b> child, and print only the
# stripped text — i.e. the bare "4.8".
import requests
from bs4 import BeautifulSoup

r = requests.get(
    'https://www.alibaba.com/product-detail/Portable-Small-USB-Travel-LED-Makeup_60830030133.html?spm=a2700.details.maylikever.2.1fb53cc2uSVPvx')
soup = BeautifulSoup(r.text, 'html.parser')

# Only extract on a successful response
if r.status_code == 200:
    item = soup.find('a', {'class': 'score-lite'}).find('b')
    print(item.get_text(strip=True))
output:
4.8

Beautiful Soup PYTHON - inside tags

Little problem with BeautifulSoup:
# Question's code: walk every div.fl, then each <p>, then each <a>, printing
# the link text and href. (The asker sees duplicates because div.fl occurs in
# two places on the page.)
from bs4 import BeautifulSoup
import requests

link = "http://www.cnnvd.org.cn/web/vulnerability/querylist.tag"
req = requests.get(link)
web = req.text
soup = BeautifulSoup(web, "lxml")

cve_name = []
cve_link = []
for par_ in soup.find_all('div', attrs={'class': 'fl'}):
    for link_ in par_.find_all('p'):
        for text_ in link_.find_all('a'):
            print(text_.string)
            print(text_['href'])
            print("==========")
            #cve_name.append(text_.string)
            #cve_link.append(text_['href'])
And it gives me each record twice. That is probably easy to solve.
The same elements are in two places on page so you have to use find()/find_all() to select only one place i.e find(class_='list_list') in
soup.find(class_='list_list').find_all('div', attrs={'class':'fl'}):
Full code:
# Answer: scope the search to the single element with class "list_list" first,
# so each div.fl (and hence each link) is visited only once.
from bs4 import BeautifulSoup
import requests

link = "http://www.cnnvd.org.cn/web/vulnerability/querylist.tag"
req = requests.get(link)
web = req.text
soup = BeautifulSoup(web, "lxml")

cve_name = []
cve_link = []
for par_ in soup.find(class_='list_list').find_all('div', attrs={'class': 'fl'}):
    print(len(par_))
    for link_ in par_.find_all('p'):
        for text_ in link_.find_all('a'):
            print(text_.string)
            print(text_['href'])
            print("==========")
            #cve_name.append(text_.string)
            #cve_link.append(text_['href'])
How about this. I used css selectors to do the same.
# Answer: one CSS selector replaces the three nested loops; urljoin turns the
# relative hrefs into absolute URLs.
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import requests

link = "http://www.cnnvd.org.cn/web/vulnerability/querylist.tag"
res = requests.get(link)
soup = BeautifulSoup(res.text, "lxml")

for item in soup.select('.fl p a'):
    print("Item: {}\nItem_link: {}".format(item.text, urljoin(link, item['href'])))
Partial Output:
Item: CNNVD-201712-811
Item_link: http://www.cnnvd.org.cn/web/xxk/ldxqById.tag?CNNVD=CNNVD-201712-811
Item: CNNVD-201712-810
Item_link: http://www.cnnvd.org.cn/web/xxk/ldxqById.tag?CNNVD=CNNVD-201712-810
Item: CNNVD-201712-809
Item_link: http://www.cnnvd.org.cn/web/xxk/ldxqById.tag?CNNVD=CNNVD-201712-809

I want to crawl data from 1 to 10 pages automatically from website.How can i do it?

# Question's code: scrape one listing page and print the text of each match.
import requests
from bs4 import BeautifulSoup

My_Url = "http://questions.consumercomplaints.in/page/2"
Data = requests.get(My_Url)
Soup = BeautifulSoup(Data.content)

# NOTE(review): passing a set/dict to find_all like this matches by tag names,
# not attributes — verify this selector does what the asker intends.
head_id = Soup.find_all({"div": "href"})
len(head_id)
for i in head_id:
    # Original used Python-2 `print i.text`; parenthesized for Python 3.
    print(i.text)
From the above code I scraped (reviews/complaints) from page 2.
How do I crawl data automatically from all pages (http://questions.consumercomplaints.in/page/3)?
Why not surround your function in a ranged for loop?
# Answer: wrap the single-page scrape in a ranged for loop over the page numbers.
import requests
from bs4 import BeautifulSoup

for page_no in range(3, 11):
    My_Url = "http://questions.consumercomplaints.in/page/" + str(page_no)
    Data = requests.get(My_Url)
    Soup = BeautifulSoup(Data.content)
    head_id = Soup.find_all({"div": "href"})
    len(head_id)
    # Inner loop variable renamed from `i` — the original shadowed the
    # outer page index. `print i.text` parenthesized for Python 3.
    for entry in head_id:
        print(entry.text)
Have look at how the range function works here.

Categories