Scraper only prints last page data instead of all pages - BS4 - python

I am scraping Trustpilot reviews, but data keeps getting overwritten with each iteration. How can I make it append all data from all pages instead of just the last one?
import re
import requests
import pandas as pd
from openpyxl import load_workbook
from bs4 import BeautifulSoup

def get_total_items(url):
    soup = BeautifulSoup(requests.get(url, format(0), headers={"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0"}).text, 'lxml')
    stars = []
    star1 = soup.find_all(attrs={"star-rating star-rating--medium"})
    stars.append(star1)
    df = pd.DataFrame(stars, ["Rating"])
    return df

ddf = []
for i in range(29):
    urls = "https://www.trustpilot.com/review/www.pandora.net?page={}"
    get_total_items(urls).append(ddf)
print(ddf)

There are two problems: the URL template is never filled in with .format(i), so every request fetches the same page, and get_total_items(urls).append(ddf) is backwards — it appends the list to the DataFrame and discards the result. Change the for loop like below:
for i in range(29):
    urls = "https://www.trustpilot.com/review/www.pandora.net?page={}"
    ddf.append(get_total_items(urls.format(i)))
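With that change, ddf ends up as a list of per-page DataFrames rather than a single table. If you want one combined frame, a minimal sketch using pd.concat (assuming every page yields a DataFrame with the same columns):

import pandas as pd

# Stack the per-page frames on top of each other;
# ignore_index renumbers the rows 0..N-1
all_reviews = pd.concat(ddf, ignore_index=True)
print(all_reviews)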

How do I scrape all the movie titles, dates, and reviews from the website below? https://www.nollywoodreinvented.com/list-of-all-reviews

I have tried the code below, but it only fetches the first page and does not fully load the reviews for the movies. I am interested in getting all the movie titles, movie dates, and reviews.
from bs4 import BeautifulSoup
import requests
import pandas as pd

url = 'https://www.nollywoodreinvented.com/list-of-all-reviews'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:55.0) Gecko/20100101 Firefox/55.0',
}
req = requests.get(url, headers=headers)
soup = BeautifulSoup(req.text, 'lxml')

movie_div = soup.find_all('div', class_='article-panel')

title = []
for div in movie_div:
    images = div.find_all('div', class_='article-image-wrapper')
    for image in images:
        image = image.find_all('div', class_='article-image')
        for img in image:
            title.append(img.a.img['title'])

date = []
for div in movie_div:
    date.append(div.find('div', class_='authorship type-date').text.strip())

info = []
for div in movie_div:
    info.append(div.find('div', class_='excerpt-text').text.strip())

movie = pd.DataFrame({'title': title, 'date': date, 'info': info}, index=None)
movie.head()
There is a backend API which serves up the HTML you are scraping. You can see it in action if you open your browser's developer tools (Network tab - Fetch/XHR) and click on the 2nd or 3rd page link. We can recreate that POST request with Python like below:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime

pages = 3
results_per_page = 500  # max 500 I think

headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'}
url = 'https://www.nollywoodreinvented.com/wp-admin/admin-ajax.php'

output = []
for page in range(1, pages + 1):
    payload = {
        'action': 'itajax-sort',
        'view': 'grid',
        'loop': 'main loop',
        'location': '',
        'thumbnail': '1',
        'rating': '1',
        'meta': '1',
        'award': '1',
        'badge': '1',
        'authorship': '1',
        'icon': '1',
        'excerpt': '1',
        'sorter': 'recent',
        'columns': '4',
        'layout': 'full',
        'numarticles': str(results_per_page),
        'largefirst': '',
        'paginated': str(page),
        # a list value makes requests send the field once per category;
        # a plain dict literal can't hold the same key twice
        'currentquery[category__in][]': ['2648', '2649']
    }
    resp = requests.post(url, headers=headers, data=payload).json()
    print(f'Scraping page: {page} - results: {results_per_page}')
    soup = BeautifulSoup(resp['content'], 'html.parser')
    for film in soup.find_all('div', class_='article-panel'):
        try:
            title = film.find('h3').text.strip()
        except AttributeError:
            continue
        date = datetime.strptime(film.find('span', class_='date').text.strip(), "%B %d, %Y").strftime('%Y-%m-%d')
        likes = film.find('span', class_='numcount').text.strip()
        if not likes:
            likes = 0
        full_stars = [1 for _ in film.find_all('span', class_='theme-icon-star-full')]
        half_stars = [0.5 for _ in film.find_all('span', class_='theme-icon-star-half')]
        stars = (sum(full_stars) + sum(half_stars)) / 2.0
        item = {
            'title': title,
            'date': date,
            'likes': likes,
            'stars': stars
        }
        output.append(item)

df = pd.DataFrame(output)
df.to_csv('nollywood_data.csv', index=False)
print('Saved to nollywood_data.csv')

Organize BeautifulSoup data into a dataframe table

I have been working with BeautifulSoup to try to organize some data that I am pulling from a website (HTML). I have been able to boil the data down, but am getting stuck on how to:
eliminate the info I don't need
organize the remaining data so it can be put into a pandas dataframe
Here is the code I am working with:
import urllib.request
from bs4 import BeautifulSoup as bs
import re
import pandas as pd
import requests

headers = requests.utils.default_headers()
headers.update({
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36'
})

url = 'https://www.apartments.com/lehi-ut/1-bedrooms/'
page = requests.get(url, headers=headers)
soup = bs(page.text)
names = soup.body.findAll('tr')
function_names = re.findall('th class="\w+', str(names))
function_names = [item[10:] for item in function_names]
description = soup.body.findAll('td')
#description = re.findall('td class="\w+', str(description))
data = pd.DataFrame({'Title': function_names, 'Info': description})
The error I have been getting is that the array lengths don't match up, which I know to be true, but when I uncomment the second description line it strips out the numbers I want, and even then the table doesn't organize itself properly.
What I would like the output to look like is:
(headers) title: location | Studio | 1 BR   | 2 BR   | 3 BR
(new line) data: Lehi, UT | $1,335 | $1,309 | $1,454 | $1,580
That is really all that I need but I can't get BS or Pandas to do it properly.
Any help would be greatly appreciated!
Try the following approach. It first extracts all of the data in the table and then transposes it (columns swapped with rows):
from bs4 import BeautifulSoup as bs
import pandas as pd
import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36'
}

url = 'https://www.apartments.com/lehi-ut/1-bedrooms/'
page = requests.get(url, headers=headers)
soup = bs(page.text, 'lxml')

table = soup.find("table", class_="rentTrendGrid")
rows = []
for tr in table.find_all('tr'):
    rows.append([td.text for td in tr.find_all(['th', 'td'])])

#header_row = rows[0]
rows = list(zip(*rows[1:]))  # transpose the table
df = pd.DataFrame(rows[1:], columns=rows[0])
print(df)
Giving you the following kind of output:
   Studio    1 BR    2 BR    3 BR
0       0     729   1,041   1,333
1  $1,335  $1,247  $1,464  $1,738
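The transpose step works because zip(*rows) pairs up the i-th element of every row. A toy example of the same idea:

# Each inner list is one row of the scraped table
rows = [['location', 'Lehi, UT'],
        ['Studio', '$1,335'],
        ['1 BR', '$1,247']]

# The * unpacks the rows as separate arguments to zip, which then
# groups first elements together, then second elements, and so on
print(list(zip(*rows)))
# [('location', 'Studio', '1 BR'), ('Lehi, UT', '$1,335', '$1,247')]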

Pandas returns an empty dataframe when trying to scrape a table

I'm trying to get the transfer history of the top 500 most valuable players on Transfermarkt. I've managed (with some help) to loop through each player's profile and scrape the image and name. Now I want the transfer history, which can be found in a table on each player's profile: Player Profile
I want to save the table in a dataframe using Pandas, and then write it to a CSV with Season, Date, etc. as headers. For Monaco and PSG, for example, I just want the names of the clubs, not the pictures or nationality. But right now, all I get is this:
Empty DataFrame
Columns: []
Index: []
Expected output:
  Season         Date    Left  Joined       MV      Fee
0  18/19  Jul 1, 2018  Monaco     PSG  120.00m  145.00m
I've viewed the source and inspected the page, but can't find anything that helps me, apart from the tbody and tr tags. I need to target that specific table, since there are several others on the page.
This is my code:
import requests
from bs4 import BeautifulSoup
import csv
import pandas as pd

site = "https://www.transfermarkt.com/spieler-statistik/wertvollstespieler/marktwertetop?ajax=yw1&page={}"

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:74.0) Gecko/20100101 Firefox/74.0'
}

result = []

def main(url):
    with requests.Session() as req:
        result = []
        for item in range(1, 21):
            print(f"Collecting Links From Page# {item}")
            r = req.get(url.format(item), headers=headers)
            soup = BeautifulSoup(r.content, 'html.parser')
            tr = soup.find_all("tbody")[1].find_all("tr", recursive=False)
            result.extend([
                {
                    "Season": t[1].text.strip()
                }
                for t in (t.find_all(recursive=False) for t in tr)
            ])
    df = pd.DataFrame(result)
    print(df)
The approach below first collects each player's profile link and name from the listing pages, then reads the transfer table from each profile with pd.read_html:
import requests
from bs4 import BeautifulSoup
import pandas as pd

site = "https://www.transfermarkt.com/spieler-statistik/wertvollstespieler/marktwertetop?ajax=yw1&page={}"

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:74.0) Gecko/20100101 Firefox/74.0'
}

def main(url):
    with requests.Session() as req:
        links = []
        names = []
        for item in range(1, 21):
            print(f"Collecting Links From Page# {item}")
            r = req.get(url.format(item), headers=headers)
            soup = BeautifulSoup(r.content, 'html.parser')
            urls = [f"{url[:29]}{item.get('href')}" for item in soup.findAll(
                "a", class_="spielprofil_tooltip")]
            ns = [item.text for item in soup.findAll(
                "a", class_="spielprofil_tooltip")][:-5]
            links.extend(urls)
            names.extend(ns)
    return links, names

def parser():
    links, names = main(site)
    for link, name in zip(links, names):
        with requests.Session() as req:
            r = req.get(link, headers=headers)
            df = pd.read_html(r.content)[1]
            df.loc[-1] = name
            df.index = df.index + 1
            df.sort_index(inplace=True)
            print(df)

parser()
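If the goal is a single CSV of every player's transfer history, a sketch that tags each table with a Player column instead of inserting the name as a row (this assumes the transfer tables on all profiles share the same columns):

def parser_to_csv():
    links, names = main(site)
    frames = []
    with requests.Session() as req:
        for link, name in zip(links, names):
            r = req.get(link, headers=headers)
            df = pd.read_html(r.content)[1]
            df['Player'] = name  # tag every row with the player's name
            frames.append(df)
    # Stack the per-player tables and write one CSV
    pd.concat(frames, ignore_index=True).to_csv('transfers.csv', index=False)

parser_to_csv()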

How do I convert a web-scraped table into a csv?

A year ago I learned some python in one of my classes but haven't had to use much since then so this may or may not be a simple question.
I'm trying to web-scrape the top grossing films of all time table from Box Office Mojo, and I want to grab the rank, title, and gross for the top 10 films of the 2010s. I've been playing around in Python and can get the entire table, but I don't know how to manipulate it from there, let alone write it out to a CSV file. Any guidance/tips?
Here is what will print the entire table for me (the first few lines are copied from an old web-scraping assignment to get me started):
import bs4
import requests
from bs4 import BeautifulSoup as soup

url = "https://www.boxofficemojo.com/chart/top_lifetime_gross/"
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}

page_html = requests.get(url, headers=headers)
page_soup = soup(page_html.text, "html.parser")
boxofficemojo_table = page_soup.find("div", {"class": "a-section imdb-scroll-table-inner"})
complete_table = boxofficemojo_table.get_text()
print(complete_table)
1. Using pd.read_html
You can use pd.read_html for this:
import pandas as pd

Data = pd.read_html(r'https://www.boxofficemojo.com/chart/top_lifetime_gross/')
# Write each table to its own file; otherwise every iteration
# would overwrite the same CSV
for i, data in enumerate(Data):
    data.to_csv(f'Data_{i}.csv')
2. Using BS4
import pandas as pd
from bs4 import BeautifulSoup
import requests

URL = r'https://www.boxofficemojo.com/chart/top_lifetime_gross/'
print('\n>> Extracting Data using Beautiful Soup for: ' + URL)
try:
    res = requests.get(URL)
except Exception as e:
    print(repr(e))

print('\n<> URL present status Code = ', (res.status_code))

soup = BeautifulSoup(res.text, "lxml")
table = soup.find('table')

list_of_rows = []
for row in table.findAll('tr'):
    list_of_cells = []
    for cell in row.findAll(["td"]):
        text = cell.text
        list_of_cells.append(text)
    list_of_rows.append(list_of_cells)

Data = pd.DataFrame(list_of_rows)
Data.dropna(axis=0, how='all', inplace=True)
print(Data.head(10))
Data.to_csv('Table.csv')
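If all you need is the rank, title, and gross for the top 10 films of the 2010s, a minimal sketch built on pd.read_html (the column names Rank, Title, Lifetime Gross, and Year are assumptions here, so check df.columns against the live page first):

import pandas as pd

df = pd.read_html('https://www.boxofficemojo.com/chart/top_lifetime_gross/')[0]

# Keep films released 2010-2019, then take the first 10 rows
# (the table is already sorted by lifetime gross)
tens = df[df['Year'].between(2010, 2019)]
top10 = tens[['Rank', 'Title', 'Lifetime Gross']].head(10)
top10.to_csv('top10_2010s.csv', index=False)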

Can't get a table from a web page

I'm using BeautifulSoup to try to get the whole table of all 2000 companies from this URL:
https://www.forbes.com/global2000/list/#tab:overall.
This is the code I have written:
from bs4 import BeautifulSoup
import urllib.request
import pandas as pd

html_content = urllib.request.urlopen('https://www.forbes.com/global2000/list/#header:position')
soup = BeautifulSoup(html_content, 'lxml')

table = soup.find_all('table')[0]
new_table = pd.DataFrame(columns=range(0, 7), index=[0])

row_marker = 0
for row in table.find_all('tr'):
    column_marker = 0
    columns = row.find_all('td')
    for column in columns:
        new_table.iat[row_marker, column_marker] = column.get_text()
        column_marker += 1

new_table
In the result, I get only the names of the columns, but not the table itself. How can I get the whole table?
The content is generated via JavaScript, so you can use selenium to mimic a browser and scroll movements and then parse the page source with Beautiful Soup, or, in some cases like this one, you can access those values by querying their ajax API:
import requests
import json

headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:50.0) Gecko/20100101 Firefox/50.0'}
target = 'https://www.forbes.com/ajax/list/data?year=2017&uri=global2000&type=organization'

with requests.Session() as s:
    s.headers = headers
    data = json.loads(s.get(target).text)

print([x['name'] for x in data[:5]])
Output (first 5 items):
['3M', '3i Group', '77 Bank', 'AAC Technologies Holdings', 'ABB']
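Because the API returns a list of JSON objects, the full result drops straight into a DataFrame; a short sketch (field names other than 'name' depend on the API response, so inspect data[0].keys() first):

import pandas as pd

# Each dict in the list becomes one row; the columns come from the JSON keys
df = pd.DataFrame(data)
print(df.shape)  # roughly (2000, number_of_fields)
df.to_csv('forbes_global2000.csv', index=False)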
