Extracting similar items from a website with beautiful soup - python

I'm trying to scrape a website's ratings. I want to get each individual rating and its particular date. However, I only get one result in my list, although there should be several.
Am I doing something wrong in the for loop?
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import time

url = "https://www.kununu.com/de/heidelpay/kommentare"
while url != " ":
    print(url)
    time.sleep(15)
    r = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
    print(r.status_code)
    soup = BeautifulSoup(r.text, "html.parser")
    #print(soup.prettify())

    # Get overall score of the company
    score_avg = soup.find("span", class_="index__aggregationValue__32exy").text
    print(score_avg)

    # Get individual scores and dates of the company
    rating_list = []
    for box in soup.select(".index__rating__3nC2L"):
        score_ind = box.select_one(".index__score__16yy9").text
        date = box.select_one(".index__date__eIOxr").text
        rating = [score_ind, date]
    rating_list.append(rating)
    print(rating_list)
3,3
[['5,0', 'Januar 2017']]
Many thanks in advance!

It looks like you aren't appending the rating to rating_list until after the loop is done. Is the printed rating perchance the very last one?
Add the append to your loop, like so:
for box in soup.select(".index__rating__3nC2L"):
    score_ind = box.select_one(".index__score__16yy9").text
    date = box.select_one(".index__date__eIOxr").text
    rating = [score_ind, date]
    rating_list.append(rating)
(Side note: find_all does not accept CSS selectors, so soup.select is used for the class selector here; select_one returns a single element whose .text works, whereas select returns a list.)

Well, the problem is that you're only appending the last rating value with rating_list.append(rating), because it's outside the for loop. What you have to do is this:
for box in soup.select(".index__rating__3nC2L"):
    score_ind = box.select_one(".index__score__16yy9").text
    date = box.select_one(".index__date__eIOxr").text
    rating = [score_ind, date]
    rating_list.append(rating)
This way you append each rating value on each iteration of the for loop. Hope this helps!
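One more hedged observation, separate from the append problem: the posted while url != " ": loop never updates url, so it would re-request the same page forever. A minimal sketch of how the end of the loop could follow a next-page link with the already-imported urljoin (the a.index__nextLink selector is a made-up placeholder, not taken from kununu's real markup):

# Hedged sketch: advance to the next page at the end of each while-iteration.
# "a.index__nextLink" is a placeholder selector; check the real page markup.
next_link = soup.select_one("a.index__nextLink")
if next_link and next_link.get("href"):
    url = urljoin(url, next_link["href"])  # resolve a relative href against the current url
else:
    url = " "  # matches the question's loop condition, so the while loop ends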

Related

web-scrape: get H4 attributes & href

I am trying to web-scrape a website, but I can't get access to the attributes of some fields.
Here is the code I used:
import urllib3
from bs4 import BeautifulSoup
import pandas as pd

scrap_list = pd.DataFrame()
for path in range(10):  # scroll over the categories
    for page in range(10):  # scroll over the pages
        url = 'https://www.samehgroup.com/index.php?route=product/category' + str(page) + '&' + 'path=' + str(path)
        req = urllib3.PoolManager()
        res = req.request('GET', url)
        soup = BeautifulSoup(res.data, 'html.parser')
        # extract names
        scrap_name = [i.text.strip() for i in soup.findAll('h4', {'class': 'caption'})]
        scrap_list['product_name'] = pd.DataFrame(scrap_name, columns=['Item_name'])
        # extract prices
        scrap_price = [i.text.strip() for i in soup.findAll('div', {'class': 'price'})]
        scrap_list['product_price'] = pd.DataFrame(scrap_price, columns=['Item_price'])
I want an output that provides me with each product and its price. I still can't get that right.
Any help would be very much appreciated.
I think the problem here was looping through the website's pages. I got the code below working by first making a list of URLs containing numbered 'paths' corresponding to categories on the website, then looping through this list and applying a page number to each URL.
If you want only the products from a certain page, that page can be selected from the urlist by index.
from bs4 import BeautifulSoup
import requests
import pandas as pd
import time

urlist = []  # create list of usable urls to iterate through
for i in range(1, 10):  # 9 paths, equal to categories on website
    urlist.append('https://www.samehgroup.com/index.php?route=product/category&path=' + str(i))

namelist = []
newprice = []

for urlunf in urlist:  # first loop to get 'path'
    for n in range(100):  # second loop to get 'pages'; set at 100 to cover website max page at 93
        try:  # try catches when pages containing products run out
            url = urlunf + '&page=' + str(n)
            page = requests.get(url).text
            soup = BeautifulSoup(page, 'html.parser')
            products = soup.find_all('div', class_='caption')
            for prod in products:  # loop over returned list of products for names and prices
                name = prod.find('h4').text
                newp = prod.find('p', class_='price').find('span', class_='price-new').text
                namelist.append(name)  # append data to lists outside of the loop
                newprice.append(newp)
            time.sleep(2)
        except AttributeError:  # if there are no more products it will move to the next page
            pass

df = pd.DataFrame()  # create df and add scraped data
df['name'] = namelist
df['price'] = newprice
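One hedged tweak worth considering: except AttributeError: pass keeps probing all 100 page numbers even after products run out. Breaking out of the inner loop at the first empty page would be faster; a minimal sketch reusing the names from the code above, under the assumption that an empty result really means the category is exhausted:

# Sketch: stop paging as soon as a page returns no product boxes,
# instead of swallowing AttributeErrors for the remaining page numbers.
for urlunf in urlist:
    for n in range(100):
        page = requests.get(urlunf + '&page=' + str(n)).text
        soup = BeautifulSoup(page, 'html.parser')
        products = soup.find_all('div', class_='caption')
        if not products:
            break  # no more products in this category, move to the next path
        for prod in products:
            name_tag = prod.find('h4')
            price_tag = prod.find('p', class_='price')
            # guard against items missing a name or a new-price span
            if name_tag and price_tag and price_tag.find('span', class_='price-new'):
                namelist.append(name_tag.text)
                newprice.append(price_tag.find('span', class_='price-new').text)
        time.sleep(2)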

Web scraping with bs4 python: How to display football matchups

I'm a beginner with Python and am trying to create a program that will scrape the football/soccer schedule from skysports.com and send it via SMS to my phone through Twilio. I've excluded the SMS code because I have that figured out; here's the web scraping code I'm stuck on so far:
import requests
from bs4 import BeautifulSoup
from collections import defaultdict  # used below; missing from the original imports

URL = "https://www.skysports.com/football-fixtures"
page = requests.get(URL)
results = BeautifulSoup(page.content, "html.parser")

d = defaultdict(list)
comp = results.find('h5', {"class": "fixres__header3"})
team1 = results.find('span', {"class": "matches__item-col matches__participant matches__participant--side1"})
date = results.find('span', {"class": "matches__date"})
team2 = results.find('span', {"class": "matches__item-col matches__participant matches__participant--side2"})

for ind in range(len(d)):
    d['comp'].append(comp[ind].text)
    d['team1'].append(team1[ind].text)
    d['date'].append(date[ind].text)
    d['team2'].append(team2[ind].text)
The code below should do the trick for you:
from bs4 import BeautifulSoup
import requests

a = requests.get('https://www.skysports.com/football-fixtures')
soup = BeautifulSoup(a.text, features="html.parser")

teams = []
for i in soup.find_all(class_="swap-text--bp30")[1:]:  # skips the first one because that's a heading
    teams.append(i.text)

date = soup.find(class_="fixres__header2").text  # first date heading on the page
print(date)

teams = [i.strip('\n') for i in teams]
for x in range(0, len(teams), 2):
    print(teams[x] + " vs " + teams[x+1])
Let me further explain what I have done:
All the football teams have this class name - swap-text--bp30
So we can use find_all to extract all the elements with that class.
Once we have our results we can put them into a list, teams = [], and append them in a for loop with teams.append(i.text). .text strips the HTML.
Then we can get rid of the "\n" in the list by stripping it, and print out the strings two by two.
EDIT: To scrape the titles of the leagues we do pretty much the same:
league = []
for i in soup.find_all(class_="fixres__header3"):  # league/competition headings
    league.append(i.text)
Strip the list and create another one:
league = [i.strip('\n') for i in league]
final = []
Then add this final bit of code, which essentially just prints the league and then the two teams, over and over:
for x in range(0, len(teams), 2):
    final.append(teams[x] + " vs " + teams[x+1])
for i in league:
    print(i)
for i in final:
    print(i)
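For completeness, here is a hedged sketch of how the question's original defaultdict approach could be made to work: the main fixes are find_all instead of find (find returns only the first match, which is why indexing it fails) and iterating over the results rather than range(len(d)) on an empty dict. The class names come from the question; whether they still match the live page, and whether the two sides line up one-to-one, are assumptions:

import requests
from bs4 import BeautifulSoup
from collections import defaultdict

URL = "https://www.skysports.com/football-fixtures"
results = BeautifulSoup(requests.get(URL).content, "html.parser")

d = defaultdict(list)
# find_all returns every matching tag; BeautifulSoup matches a single class
# name against multi-class attributes, so the short class is enough here.
side1 = results.find_all('span', {"class": "matches__participant--side1"})
side2 = results.find_all('span', {"class": "matches__participant--side2"})

# zip pairs the two sides; assumes both lists align one-to-one per match
for t1, t2 in zip(side1, side2):
    d['team1'].append(t1.get_text(strip=True))
    d['team2'].append(t2.get_text(strip=True))

for t1, t2 in zip(d['team1'], d['team2']):
    print(t1, "vs", t2)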

Python - Beautiful Soup: Webscraping PubMed - extracting PMIDs (an article ID), adding to list, and preventing duplicate scraping

I want to extract research abstracts from PubMed. I will have multiple URLs to search for publications, and some of them will contain the same articles as others. Each article has a unique ID called a PMID. Basically, the abstract URL of each article is a base URL + the PMID (example: https://pubmed.ncbi.nlm.nih.gov/ + 32663045). However, I don't want to extract the same article twice, for multiple reasons (i.e., it takes longer to run the entire code and uses up more bandwidth), so once I extract a PMID, I add it to a list. I'm trying to make my code extract information from each abstract only once; however, my code is still extracting duplicate PMIDs and publication titles.
I know how to get rid of duplicates in Pandas in my output, but that's not what I want to do. I want to basically skip over PMIDs/URLs that I already scraped.
Current Output
Title| PMID
COVID-19 And Racial/Ethnic Disparities In Health Risk | 32663045
The Risk Of Severe COVID-19 | 32941086
COVID-19 And Racial/Ethnic Disparities In Health Risk | 32663045
The Risk Of Severe COVID-19 | 32941086
Desired Output
Title| PMID
COVID-19 And Racial/Ethnic Disparities In Health Risk | 32663045
The Risk Of Severe COVID-19 | 32941086
Here's my code:
from bs4 import BeautifulSoup
import csv
import time
import requests
import pandas as pd

all_pmids = []
out = []

search_urls = ['https://pubmed.ncbi.nlm.nih.gov/?term=%28AHRQ%5BAffiliation%5D%29+AND+%28COVID-19%5BText+Word%5D%29&sort=','https://pubmed.ncbi.nlm.nih.gov/?term=%28AHRQ%5BAffiliation%5D%29+AND+%28COVID-19%5BText+Word%5D%29&sort=']

for search_url in search_urls:
    response = requests.get(search_url)
    soup = BeautifulSoup(response.content, 'html.parser')
    pmids = soup.find_all('span', {'class' : 'docsum-pmid'})
    for p in pmids:
        p = p.get_text()
        all_pmids.append(p) if p not in all_pmids else print('project already in list, skipping')
    for pmid in all_pmids:
        url = 'https://pubmed.ncbi.nlm.nih.gov/'+pmid
        response2 = requests.get(url)
        soup2 = BeautifulSoup(response2.content, 'html.parser')
        title = soup2.select('h1.heading-title')[0].text.strip()
        data = {'title': title, 'pmid': pmid, 'url':url}
        time.sleep(3)
        out.append(data)

df = pd.DataFrame(out)
df.to_excel('my_results.xlsx')
Just an indentation error, or more accurately, a problem with where you are running your two for loops. If it isn't just an overlooked mistake, read the explanation; if it is just a mistake, unindent your second for loop.
Because you are searching all_pmids within your larger search_url loop without resetting it after each search, it finds the first two pmids, adds them to all_pmids, then runs the next loop for those two.
In the second run of the outer loop, it finds the next two pmids, sees they're already in `all_pmids` so it doesn't add them, but still runs the inner loop on the first two still stored in the list.
You should run the inner loop separately, as such:
from bs4 import BeautifulSoup
import csv
import time
import requests
import pandas as pd

all_pmids = []
out = []

search_urls = ['https://pubmed.ncbi.nlm.nih.gov/?term=%28AHRQ%5BAffiliation%5D%29+AND+%28COVID-19%5BText+Word%5D%29&sort=','https://pubmed.ncbi.nlm.nih.gov/?term=%28AHRQ%5BAffiliation%5D%29+AND+%28COVID-19%5BText+Word%5D%29&sort=']

for search_url in search_urls:
    response = requests.get(search_url)
    soup = BeautifulSoup(response.content, 'html.parser')
    pmids = soup.find_all('span', {'class' : 'docsum-pmid'})
    for p in pmids:
        p = p.get_text()
        all_pmids.append(p) if p not in all_pmids else print('project already in list, skipping')

for pmid in all_pmids:
    url = 'https://pubmed.ncbi.nlm.nih.gov/'+pmid
    response2 = requests.get(url)
    soup2 = BeautifulSoup(response2.content, 'html.parser')
    title = soup2.select('h1.heading-title')[0].text.strip()
    data = {'title': title, 'pmid': pmid, 'url':url}
    time.sleep(3)
    out.append(data)

df = pd.DataFrame(out)
df.to_excel('my_results.xlsx')
You should move the for pmid in all_pmids loop outside the for search_url in search_urls loop:
...
for search_url in search_urls:
    response = requests.get(search_url)
    soup = BeautifulSoup(response.content, 'html.parser')
    pmids = soup.find_all('span', {'class' : 'docsum-pmid'})
    for p in pmids:
        p = p.get_text()
        all_pmids.append(p) if p not in all_pmids else print('project already in list, skipping')

## move this for loop outside!!
for pmid in all_pmids:
    url = 'https://pubmed.ncbi.nlm.nih.gov/'+pmid
    response2 = requests.get(url)
    soup2 = BeautifulSoup(response2.content, 'html.parser')
    ...
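As a side note, a set would make the duplicate check cleaner and O(1) per lookup. A small hedged sketch of the same two-phase flow using a set (search_urls as defined in the answers above, title extraction unchanged):

import time
import requests
from bs4 import BeautifulSoup

# Sketch: collect PMIDs into a set so duplicates are skipped automatically.
seen_pmids = set()
for search_url in search_urls:
    soup = BeautifulSoup(requests.get(search_url).content, 'html.parser')
    for span in soup.find_all('span', {'class': 'docsum-pmid'}):
        seen_pmids.add(span.get_text())  # adding a known PMID is a no-op

# Scrape each unique PMID exactly once, outside the search-URL loop.
out = []
for pmid in sorted(seen_pmids):  # sorted only to make the order deterministic
    url = 'https://pubmed.ncbi.nlm.nih.gov/' + pmid
    soup2 = BeautifulSoup(requests.get(url).content, 'html.parser')
    out.append({'title': soup2.select('h1.heading-title')[0].text.strip(),
                'pmid': pmid, 'url': url})
    time.sleep(3)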

Scraping website with BS4 // accessing class

I am trying to extract different pieces of information from websites with BeautifulSoup, such as the title of the product and the price.
I do that with different URLs, looping through the URLs with for...in.... Here, I'll just provide a snippet without the loop.
from bs4 import BeautifulSoup
import requests
import csv
url= 'https://www.mediamarkt.ch/fr/product/_lg-oled65gx6la-1991479.html'
html_content = requests.get(url).text
soup = BeautifulSoup(html_content, "lxml")
price = soup.find('meta', property="product:price:amount")
title = soup.find("div", {"class": "flix-model-name"})
title2 = soup.find('div', class_="flix-model-name")
title3 = soup.find("div", attrs={"class": "flix-model-name"})
print(price['content'])
print(title)
print(title2)
print(title3)
So from this URL https://www.mediamarkt.ch/fr/product/_lg-oled65gx6la-1991479.html I want to extract the product number. The only place I find it is in the div class="flix-model-name". However, I am totally unable to reach it. I tried different ways to access it in title, title2 and title3, but I always get the output None.
I am a bit of a beginner, so I guess I am probably missing something basic... If so, please pardon me for that.
Any help is welcome! Many thanks in advance!
Just for info: with each URL I thought of appending the data and writing it to a CSV file like this:
for url in urls:
    html_content = requests.get(url).text
    soup = BeautifulSoup(html_content, "lxml")
    row = []
    try:
        # title = YOUR VERY WELCOMED ANSWER
        prices = soup.find('meta', property="product:price:amount")
        row = (title.text + ',' + prices['content'] + '\n')
        data.append(row)
    except:
        pass

file = open('database.csv', 'w')
i = 0
while i < (len(data)):
    file.write(data[i])
    i += 1
file.close()
Many thanks in advance for your help!
David
Try the approach below using the requests library: simple, straightforward, reliable, fast, and less code is required. I fetched the API URL from the website itself after inspecting the network section of the Google Chrome browser.
What exactly the script below is doing:
First it takes the API URL and creates the final URL from 2 dynamic parameters (product and category), then does a GET request to fetch the data.
After getting the data, the script parses the JSON using json.loads.
Finally, it iterates over the lists of products one by one and prints the details, which are divided into 2 categories, 'box1_ProductToProduct' and 'box2_KategorieTopseller': Brand, Name, Product number and Unit price. In the same way you can add more details by looking into the API call.
import json
import requests
from urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

def scrap_product_details():
    PRODUCT = 'MMCH1991479'  # Product number
    CATEGORY = '680942'  # Category number
    URL = 'https://www.mediamarkt.ch/rde_server/res/MMCH/recomm/product_detail/sid/WACXyEbIf3khlu6FcHlh1B1?product=' + PRODUCT + '&category=' + CATEGORY  # dynamic URL
    response = requests.get(URL, verify=False)  # GET request to fetch the data
    result = json.loads(response.text)  # Parse JSON data using json.loads
    box1_ProductToProduct = result[0]['box1_ProductToProduct']  # Extracted data from API
    box2_KategorieTopseller = result[1]['box2_KategorieTopseller']

    for item in box1_ProductToProduct:  # loop over extracted data
        print('-' * 100)
        print('Brand : ', item['brand'])
        print('Name : ', item['name'])
        print('Net Unit Price : ', item['netUnitPrice'])
        print('Product Number : ', item['product_nr'])
        print('-' * 100)

    for item in box2_KategorieTopseller:  # loop over extracted data
        print('-' * 100)
        print('Brand : ', item['brand'])
        print('Name : ', item['name'])
        print('Net Unit Price : ', item['netUnitPrice'])
        print('Product Number : ', item['product_nr'])
        print('-' * 100)

scrap_product_details()
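Since the asker ultimately wants the data in a CSV, here is a hedged sketch of how the print loops inside scrap_product_details could be replaced with a csv writer. Which fields every item actually carries is an assumption, hence the .get() calls:

import csv

# Sketch: goes inside scrap_product_details, where box1_ProductToProduct and
# box2_KategorieTopseller are in scope; writes one CSV row per product.
with open('products.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['brand', 'name', 'netUnitPrice', 'product_nr'])  # header row
    for item in box1_ProductToProduct + box2_KategorieTopseller:
        # .get() avoids a KeyError if an item lacks one of the fields
        writer.writerow([item.get('brand'), item.get('name'),
                         item.get('netUnitPrice'), item.get('product_nr')])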

How to extract a span tag inside another div tag

I have written code in Python using Beautiful Soup for extracting user names and their ratings from IMDB. But many users did not give a rating for their reviews, so it becomes difficult to map ratings exactly to their reviews. How can I do this part?
http://www.imdb.com/title/tt2866360/reviews?ref_=tt_ov_rt
In this URL, reviews are not assigned a rating.
url1 = "http://www.imdb.com/title/tt2866360/reviews?ref_=tt_ov_rt"
response = requests.get(url1, headers=headers)
page = response.content
soup = BeautifulSoup(page)
for k in soup.findAll('div', {"class": "load-more-data"}):
    if k.name == 'span' and m['class'] == "rating-other-user-rating":
        print(blah())
    else:
        print(blah1())
This is the code to check whether the rating part exists in the review part or not, but it does not return anything.
The information you're looking for (username, rating) is located in 'div.review-container' tags.
About the tags that have no rating, you can just ignore them.
for k in soup.find_all('div', {"class": "review-container"}):
    rating = k.find('span', class_='rating-other-user-rating')
    if rating:
        rating = ''.join(i.text for i in rating.find_all('span')[-2:])
    name = k.find('span', class_='display-name-link').text
    print(name, rating)
The information that shows when you press the Load More button is loaded via XHR requests.
You'll find all the data you need in order to perform the request in a 'div.load-more-data' tag.
load_more = soup.find('div', class_='load-more-data')
url = 'http://www.imdb.com{}?paginationKey={}'.format(
    load_more['data-ajaxurl'], load_more['data-key']
)
r = requests.get(url)
soup = BeautifulSoup(r.content, 'html.parser')
Just repeat the above process until you have all the info.
import requests
from bs4 import BeautifulSoup

url = "http://www.imdb.com/title/tt2866360/reviews?ref_=tt_ov_rt"
ajax_url = url.split('?')[0] + "/_ajax?paginationKey={}"
reviews = []

while True:
    r = requests.get(url)
    soup = BeautifulSoup(r.content, 'html.parser')
    for k in soup.find_all('div', {"class": "review-container"}):
        rating = k.find('span', class_='rating-other-user-rating')
        if rating:
            rating = ''.join(i.text for i in rating.find_all('span')[-2:])
        name = k.find('span', class_='display-name-link').text
        reviews.append([name, rating])
        print(name, rating)
    load_more = soup.find('div', class_='load-more-data')
    if not load_more:
        break
    url = ajax_url.format(load_more['data-key'])
I suggest you inspect the content of the <div class="review-container" ...> element of every review in your browser's developer console, then select the specific data you want to retrieve.
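To keep reviews and missing ratings mapped together downstream, note that the reviews list built above pairs every name with either a rating string or None. It therefore drops straight into pandas; a small hedged sketch (the pandas step is an addition, not part of the original answer):

import pandas as pd

# Sketch: 'reviews' is the list of [name, rating] pairs built in the answer
# above; rating is None whenever the reviewer left no score.
df = pd.DataFrame(reviews, columns=['name', 'rating'])
df['rating'] = df['rating'].fillna('no rating given')  # make missing ratings explicit
df.to_csv('imdb_reviews.csv', index=False)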
