The script I have here finds the rating of a movie from parsed HTML. What if I wanted to, say, find a top 50 list by genre and get the output in JSON format (movie: rating)? How do I get this info in Python: with an API or some other method?
import requests
from bs4 import BeautifulSoup
URL = "google.com/search?q={}"
def find_rating(name):
ratings = {}
r = requests.get(URL.format(name))
s = BeautifulSoup(r.text, "html.parser")
n = s.find_all("div", class_ = "sDYTm")
for i in n:
d = i.text.split(".")
ratings[d[1]] = d[0]
return ratings
if __name__ == "__main__":
    movie = "Good Will Hunting"
    rating = find_rating(movie)
    print(rating)
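One way to approach the top-50-by-genre part (a sketch, not a tested solution): IMDb's advanced title search can be filtered by genre and sorted by rating, and its result page uses the same 'lister-item mode-advanced' containers that come up in the questions below. The URL parameters and CSS classes here are assumptions about that markup and may change; as far as I know, free APIs such as OMDb only do per-title lookups rather than ranked genre lists, which is why this sketch scrapes the search page instead.
import json
import requests
from bs4 import BeautifulSoup

def top_by_genre(genre, count=50):
    # assumed IMDb advanced-search URL; genre, sort and count are query parameters
    url = (f"https://www.imdb.com/search/title/?title_type=feature"
           f"&genres={genre}&sort=user_rating,desc&count={count}")
    headers = {"User-Agent": "Mozilla/5.0"}  # plain requests are often blocked
    soup = BeautifulSoup(requests.get(url, headers=headers).text, "html.parser")
    ratings = {}
    for item in soup.find_all("div", class_="lister-item mode-advanced"):
        name = item.h3.a.text
        strong = item.find("strong")  # holds the rating when one exists
        if strong:
            ratings[name] = strong.text
    return ratings

if __name__ == "__main__":
    print(json.dumps(top_by_genre("comedy"), indent=2))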
I'm trying to get the "all splits" line of numbers from https://insider.espn.com/nba/player/splits/_/id/532/type/nba/year/2003/category/perGame, but my code returns the 'All Splits' text instead of the numbers I'm looking for. How do I change the lookups in the GetStats function to get the numbers instead of the first-column descriptors?
import requests
from bs4 import BeautifulSoup
import re
from concurrent.futures import ThreadPoolExecutor
import pandas as pd
import csv
urls = []
data = []
for year in range(2003, 2005):
    for page in range(1, 9):
        url = f'http://www.espn.com/nba/hollinger/statistics/_/page/{page}/year/{year}/qualified/false'
        if url is not None:
            urls.append(url)

def GetData(url):
    names_list = []  # names of players
    pers = []  # player efficiency ratings
    playeridlist = []  # list of player ids to be used in making new searchable stats urls
    statsurls = []  # list of urls generated to get player stats
    # makes a pattern for the function to look for
    pattern = re.compile(r'playerId=(\d+)')
    # sets up soup
    req = requests.get(url)
    soup = BeautifulSoup(req.text, 'lxml')
    # finds player names and adds them to the list
    names = soup.find(lambda tag: tag.name == 'a' and 'playerId' in tag['href'])
    bodytext = names.text
    names_list.append(bodytext)
    # finds the player efficiency rating and adds it to the list
    pertag = soup.find('td', class_='sortcell')
    per = pertag.text
    pers.append(per)
    # finds player id
    names = soup.find('a', href=pattern)
    player_id = names['href'].split('playerId=')[1]
    playeridlist.append(player_id)
    # uses player id to make a list of new urls for that player and get stats
    for player_id in playeridlist:
        statsurl = f"https://insider.espn.com/nba/player/splits/_/id/{player_id}/type/nba/year/{year}/category/perGame"
        if statsurl is not None:
            statsurls.append(statsurl)
    # parses stats to get stats
    def GetStats(statsurl):  # GO BACK AND MAKE A THREAD EXECUTOR STATEMENT WITHIN GETDATA FUNCTION BELOW THIS!!!
        statsreq = requests.get(statsurl)
        statssoup = BeautifulSoup(statsreq.text, 'lxml')
        focusing_search = statssoup.find('tr', class_='Table__TR Table__TR--sm Table__even', attrs={'data-idx': '1'})
        playerstathtml = focusing_search.find('td', class_='Table__TD')
        stat_values = [playerstats.text for playerstats in playerstathtml]
        print(stat_values)
    GetStats("https://insider.espn.com/nba/player/splits/_/id/532/type/nba/year/2003/category/perGame")
    #name_and_stats_list = dict(map(lambda i, j: (i, j), names_list, pers))
    print(f"{bodytext}: {per}")
    print(player_id)

GetData('http://www.espn.com/nba/hollinger/statistics/_/page/1/year/2003/qualified/false')
To get the all_splits stats from:
https://insider.espn.com/nba/player/splits/_/id/532/type/nba/year/2003/category/perGame
This is what I did:
I grabbed the table body using soup.select
Then I grabbed the headings and relevant stats by iterating through the columns/rows.
The list comprehension provides the text in list format, which is easy to convert to a dataframe.
Code:
import requests
from bs4 import BeautifulSoup
import pandas as pd
url = 'https://insider.espn.com/nba/player/splits/_/id/532/type/nba/year/2003/category/perGame'
soup = BeautifulSoup(requests.get(url).content, "html.parser")
t = soup.select('main#fittPageContainer div.Table__Scroller > table > tbody')
headings = [h.text for h in t[0].find_next('tr').find_all('td')]
all_splits = [h.text for h in t[0].find_all('tr')[1].find_all('td')]
df = pd.DataFrame([all_splits], columns=headings)
print(df)
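If you also want the other split rows (home/away, by month, and so on) rather than just the All Splits line, the same selection extends naturally. This is an untested sketch that assumes the same table layout as above:
import requests
import pandas as pd
from bs4 import BeautifulSoup

url = 'https://insider.espn.com/nba/player/splits/_/id/532/type/nba/year/2003/category/perGame'
soup = BeautifulSoup(requests.get(url).content, "html.parser")
t = soup.select('main#fittPageContainer div.Table__Scroller > table > tbody')

# the first row holds the headings, the remaining rows hold each split
rows = [[td.text for td in tr.find_all('td')] for tr in t[0].find_all('tr')]
df_all = pd.DataFrame(rows[1:], columns=rows[0])
print(df_all)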
Output: a one-row dataframe with the All Splits values under the table headings.
How do I filter out certain skills like 'django' and 'Django' from a collection of skills provided by users through an input form using a Python function?
I've used requests and bs4 to get the raw data, but I need to filter through the results. Here's my code so far:
from bs4 import BeautifulSoup
import requests
import time
unfamiliar_skills = list(map(str,input('>')))
def find_jobs():
    html_text = requests.get('https://www.timesjobs.com/candidate/job-search.html?searchType=personalizedSearch&from=submit&txtKeywords=python&txtLocation=').text
    soup = BeautifulSoup(html_text, 'lxml')
    jobs = soup.find_all('li', class_='clearfix job-bx wht-shd-bx')
    # we first created the parsing for one output, then used a for loop to parse multiple instances of it
    for index, job in enumerate(jobs):
        published_date = job.find('span', class_='sim-posted').span.text  # checked first so we skip the job when the publish date is not "few days ago"
        if 'few' in published_date:
            company_name = job.find('h3', class_='joblist-comp-name').text.replace(' ', '')
            skills = job.find('span', class_='srp-skills').text.replace(' ', '')
            more_info = job.header.h2.a['href']  # accessed like a dictionary
            if filter(unfamiliar_skills, skills):
                with open(f'C:/Users/USER/{index}.txt', 'w') as f:
                    f.write(f'Company Name: {company_name.strip()} \n')
                    f.write(f'Required Skills: {skills.strip()} \n')
                    f.write(f'more_info: {more_info} \n')
                print(f'File saved: {index}')

if __name__ == '__main__':
    while True:
        find_jobs()
        time_wait = 10
        print(f'Waiting {time_wait} minutes...')
        time.sleep(time_wait * 60)
Here is the printed output of the skills variable:
rest,python,database,django,debugging,mongodb
python,webtechnologies,linux,mobile,mysql,angularjs,javascript
rest,python,security,debugging
python,docker,messaging,pythonscripting
python,git,django
python,database,django,mysql,api
python,hadoop,machinelearning
rest,python,django,git
python,django,,framework
python,java,scala
python,linux,windows,sql
python,webdeveloper,webservices
rest,python,database,django,api
Python,Django,Flask
python,django,javascript,webprogramming
python,Django,ObjectRelationalMapper
python,webtechnologies,webtechnologies
python,django,html5,javascript
python,django,html5,javascript
None
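As for the actual filtering question (treating 'django' and 'Django' the same), here is a minimal sketch of one way to do it. It assumes the unfamiliar skills are typed as a comma-separated string; note that list(map(str, input('>'))) in the code above splits the input into individual characters, which is probably not what you want.
# read e.g. "django,mongodb" and normalise to a lowercase set
unfamiliar_skills = {s.strip().lower() for s in input('> ').split(',') if s.strip()}

def is_familiar(skills_text):
    # True when none of the job's skills appear in the unfamiliar set
    job_skills = {s.strip().lower() for s in skills_text.split(',') if s.strip()}
    return not (job_skills & unfamiliar_skills)

# examples using lines from the output above, with 'django' marked unfamiliar
print(is_familiar('rest,python,database,django,debugging,mongodb'))  # False
print(is_familiar('python,java,scala'))                              # True
In find_jobs the check would then become if is_familiar(skills): instead of if filter(unfamiliar_skills, skills):, since that filter() call is never actually evaluated and the condition is always truthy.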
So I wanted to extract all the Bollywood movies, and the project requires movie titles, cast, crew, IMDb IDs, etc. I am not able to get all the IMDb IDs; I get a NoneType error. It worked quite well when I used it on one page only, but when I use it on multiple pages it shows an error. Please help.
Here is my code:
#importing the libraries needed
import pandas as pd
import numpy as np
import requests
import re
from bs4 import BeautifulSoup
from time import sleep
from random import randint
#declaring empty lists so that we can append the scraped data to them
movie_name = []
year = []
time=[]
rating=[]
votes = []
description = []
director_s = []
starList= []
imdb_id = []
#the whole core of the script
url = "https://www.imdb.com/search/title/?title_type=feature&primary_language=hi&sort=num_votes,desc&start=1&ref_=adv_nxt"
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html.parser')
movie_data = soup.findAll('div', attrs = {'class': 'lister-item mode-advanced'})
for store in movie_data:
    name = store.h3.a.text
    movie_name.append(name)
    year_of_release = store.h3.find('span', class_="lister-item-year text-muted unbold").text
    year.append(year_of_release)
    runtime = store.p.find("span", class_='runtime').text if store.p.find("span", class_='runtime') else " "
    time.append(runtime)
    rate = store.find('div', class_="inline-block ratings-imdb-rating").text.replace('\n', '') if store.find('div', class_="inline-block ratings-imdb-rating") else " "
    rating.append(rate)
    value = store.find_all('span', attrs={'name': "nv"})
    vote = value[0].text if value else " "
    votes.append(vote)
    # Description of the movie
    describe = store.find_all('p', class_='text-muted')
    description_ = describe[1].text.replace('\n', '') if len(describe) > 1 else ' '
    description.append(description_)
    ## Director
    ps = store.find_all('p')
    for p in ps:
        if 'Director' in p.text:
            director = p.find('a').text
            director_s.append(director)
    ## ID
    imdbID = store.find('span', 'rating-cancel').a['href'].split('/')[2]
    imdb_id.append(imdbID)
    ## Actors
    star = store.find("p", attrs={"class": ""}).text.replace("Stars:", "").replace("\n", "").replace("Director:", "").strip()
    starList.append(star)
Error:
AttributeError Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_17576/2711511120.py in <module>
63
64 ## IDs
---> 65 imdbID = store.find('span','rating-cancel').a['href'].split('/')[2] if store.find('span','rating-cancel').a['href'].split('/')[2] else ' '
66 imdb_id.append(imdbID)
67
AttributeError: 'NoneType' object has no attribute 'a'
Change your condition to the following, because first you have to check whether the <span> exists:
imdbID = store.find('span','rating-cancel').a.get('href').split('/')[2] if store.find('span','rating-cancel') else ' '
Example
Check the URL; on this page some of the <span> elements are missing:
import requests
from bs4 import BeautifulSoup
#the whole core of the script
url = "https://www.imdb.com/search/title/?title_type=feature&primary_language=hi&sort=my_ratings,desc"
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html.parser')
movie_data = soup.find_all('div', attrs = {'class': 'lister-item mode-advanced'})
for store in movie_data:
    imdbID = store.find('span', 'rating-cancel').a.get('href').split('/')[2] if store.find('span', 'rating-cancel') else ' '
    print(imdbID)
Output
tt9900050
tt9896506
tt9861220
tt9810436
tt9766310
tt9766294
tt9725058
tt9700334
tt9680166
tt9602804
Even better, scrape the id via the image tag, because it is always there even if only the placeholder image is present:
imdbID = store.img.get('data-tconst')
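Dropped into the loop above, that would look like this (same page; a small untested sketch):
for store in movie_data:
    # data-tconst carries the title id (e.g. tt9900050) even when the rating span is absent
    imdbID = store.img.get('data-tconst')
    print(imdbID)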
I would like to extract at least 20 user reviews for each movie, but I don't know how to loop from the IMDb title page into the user reviews with BeautifulSoup.
start link = "https://www.imdb.com/search/title/?title_type=feature,tv_movie&release_date=2018-01-01,2019-12-31&count=250";
title_link(1) = "https://www.imdb.com/title/tt7131622/?ref_=adv_li_tt";
user_reviews_link_movie1 = "https://www.imdb.com/title/tt7131622/reviews?ref_=tt_ov_rt" ;
I am able to extract the titles, years, ratings and metascores of each movie in the list from a static page.
# Import packages and set urls
from requests import get
url = 'https://www.imdb.com/search/title/?title_type=feature,tv_movie&release_date=2018-01-01,2019-12-31&count=250'
response = get(url)
print(response.text[:500])
from bs4 import BeautifulSoup
html_soup = BeautifulSoup(response.text, 'html.parser')
type(html_soup)
movie_containers = html_soup.find_all('div', class_ = 'lister-item mode-advanced')
print(type(movie_containers))
print(len(movie_containers))
# Lists to store the scraped data in
names = []
years = []
imdb_ratings = []
metascores = []
votes = []
# Extract data from individual movie container
for container in movie_containers:
    # If the movie has Metascore, then extract:
    if container.find('div', class_='ratings-metascore') is not None:
        # The name
        name = container.h3.a.text
        names.append(name)
        # The year
        year = container.h3.find('span', class_='lister-item-year').text
        years.append(year)
        # The IMDB rating
        imdb = float(container.strong.text)
        imdb_ratings.append(imdb)
        # The Metascore
        m_score = container.find('span', class_='metascore').text
        metascores.append(int(m_score))
import pandas as pd
test_df = pd.DataFrame({'movie': names,'year': years,'imdb': imdb_ratings,'metascore': metascores})
test_df
Actual results:
movie year imdb metascore
Once Upon a Time... in Hollywood (2019) (8.1) (83)
Scary Stories (2019) (6.5) (61)
Fast & Furious: Hobbs & Shaw (2019) (6.8) (60)
Avengers: Endgame (2019) (8.6) (78)
Expected :
movie1 year1 imdb1 metascore1 review1
movie1 year1 imdb1 metascore1 review2
...
movie1 year1 imdb1 metascore1 review20
movie2 year2 imdb2 metascore2 review1
...
movie2 year2 imdb2 metascore2 review20
...
movie250 year250 imdb250 metascore250 review20
Assuming the answer to my question in the comments is "yes".
Below is a solution to your initial request.
There's a check for whether a particular film really has 20 reviews; if it has fewer, all available ones are gathered.
Technically the parsing process is correct; I checked it by assigning movie_containers = movie_containers[:3]. Gathering all the data will take some time.
UPDATE: I just finished collecting info on all 250 films. Everything is scraped without errors, so the block after the solution itself is just FYI.
Also, if you want to go further with your parsing, i.e. collect data for the next 250 films and so on, you can add one more looping level to this parser (see the sketch after the code below). The process is similar to the one in the "Reviews extracting" section.
# Import packages and set urls
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
base_url = 'https://www.imdb.com/search/title/?title_type=feature,tv_movie&release_date=2018-01-01,2019-12-31&count=250'
url_header_for_reviews = 'https://www.imdb.com'
url_tail_for_reviews = 'reviews?ref_=tt_urv'
base_response = get(base_url)
html_soup = BeautifulSoup(base_response.text, 'html.parser')
movie_containers = html_soup.find_all('div', class_ = 'lister-item mode-advanced')
result_df = pd.DataFrame()
# Extract data from individual movie container
for container in movie_containers:
    # If the movie has Metascore, then extract:
    if container.find('div', class_='ratings-metascore') is not None:
        # Reviews extracting
        num_reviews = 20
        # Getting the last piece of the link puzzle for a movie's reviews link
        url_middle_for_reviews = container.find('a')['href']
        # Opening the reviews page of a concrete movie
        response_reviews = get(url_header_for_reviews + url_middle_for_reviews + url_tail_for_reviews)
        reviews_soup = BeautifulSoup(response_reviews.text, 'html.parser')
        # Searching all reviews
        reviews_containers = reviews_soup.find_all('div', class_='imdb-user-review')
        # Check if the actual number of reviews is less than the target one
        if len(reviews_containers) < num_reviews:
            num_reviews = len(reviews_containers)
        # Looping through each review and extracting title and body
        reviews_titles = []
        reviews_bodies = []
        for review_index in range(num_reviews):
            review_container = reviews_containers[review_index]
            review_title = review_container.find('a', class_='title').text.strip()
            review_body = review_container.find('div', class_='text').text.strip()
            reviews_titles.append(review_title)
            reviews_bodies.append(review_body)
        # The name
        name = container.h3.a.text
        names = [name for i in range(num_reviews)]
        # The year
        year = container.h3.find('span', class_='lister-item-year').text
        years = [year for i in range(num_reviews)]
        # The IMDB rating
        imdb_rating = float(container.strong.text)
        imdb_ratings = [imdb_rating for i in range(num_reviews)]
        # The Metascore
        metascore = container.find('span', class_='metascore').text
        metascores = [metascore for i in range(num_reviews)]
        # Gathering up scraped data into result_df
        if result_df.empty:
            result_df = pd.DataFrame({'movie': names, 'year': years, 'imdb': imdb_ratings, 'metascore': metascores, 'review_title': reviews_titles, 'review_body': reviews_bodies})
        elif num_reviews > 0:
            result_df = result_df.append(pd.DataFrame({'movie': names, 'year': years, 'imdb': imdb_ratings, 'metascore': metascores, 'review_title': reviews_titles, 'review_body': reviews_bodies}))
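As mentioned above, going beyond the first 250 films is just one more looping level over the search URL itself. A rough sketch, reusing the imports from the snippet above (the start offset parameter is my assumption about IMDb's search URL and is worth double-checking):
# walk through several result pages: start=1, 251, 501, ... with count=250 per page
for start in range(1, 1001, 250):
    page_url = ('https://www.imdb.com/search/title/?title_type=feature,tv_movie'
                f'&release_date=2018-01-01,2019-12-31&count=250&start={start}')
    page_soup = BeautifulSoup(get(page_url).text, 'html.parser')
    movie_containers = page_soup.find_all('div', class_='lister-item mode-advanced')
    # ...then run the same per-container extraction as above on these movie_containers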
By the way, I'm not sure that IMDb will let you gather data for all films in a loop as is. There's a possibility that you'll get a captcha or a redirection to some other page. If these issues appear, I'd go with a simple solution: pauses in scraping and/or changing user-agents.
Pause (sleep) can be implemented as follows:
import time
import numpy as np
time.sleep((30-5)*np.random.random()+5) #from 5 to 30 seconds
Inserting a user-agent in a request can be done as follows:
import requests
from bs4 import BeautifulSoup
url = ('http://www.link_you_want_to_make_request_on.com/bla_bla')
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, 'html.parser')
Google some other user-agent variants, make a list of them, and switch between them from time to time in subsequent requests. Watch out, though, for which user-agents you use: some of them indicate mobile or tablet devices, and for those a site (not only IMDb) can return pages in a format that differs from the PC one (other markup, other design, etc.). So in general the above algorithm works only for the PC version of the pages.
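A minimal sketch of that rotation idea (the user-agent strings are just examples; substitute whichever desktop ones you collect):
import random
import requests

# example desktop user-agent strings; extend this list with your own
user_agents = [
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
]

def polite_get(url):
    # pick a random user-agent for each request
    headers = {'User-Agent': random.choice(user_agents)}
    return requests.get(url, headers=headers)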
I have written code in Python using Beautiful Soup to extract user names and their ratings from IMDb. But many users did not give a rating with their reviews, which makes it difficult to map ratings exactly to their reviews. How can I do this part?
http://www.imdb.com/title/tt2866360/reviews?ref_=tt_ov_rt
In this URL, some reviews have no rating assigned.
url1 = "http://www.imdb.com/title/tt2866360/reviews?ref_=tt_ov_rt"
response = requests.get(url1, headers=headers)
page = response.content
soup = BeautifulSoup(page)
for k in soup.findAll('div', {"class": "load-more-data"}):
    if k.name == 'span' and m['class'] == "rating-other-user-rating":
        print(blah())
    else:
        print(blah1())
This is the code to check whether the rating part exists in the review part or not, but it does not return anything.
The information you're looking for (username, rating) is located in 'div.review-container' tags.
As for the tags that have no rating, you can just ignore them.
for k in soup.find_all('div', {"class": "review-container"}):
    rating = k.find('span', class_='rating-other-user-rating')
    if rating:
        rating = ''.join(i.text for i in rating.find_all('span')[-2:])
    name = k.find('span', class_='display-name-link').text
    print(name, rating)
The information that shows when you press the Load More button is loaded via XHR requests.
You'll find all the data you need in order to perform the request in a 'div.load-more-data' tag.
load_more = soup.find('div', class_='load-more-data')
url = 'http://www.imdb.com{}?paginationKey={}'.format(
    load_more['data-ajaxurl'], load_more['data-key']
)
r = requests.get(url)
soup = BeautifulSoup(r.content, 'html.parser')
Just repeat the above process until you have all the info.
import requests
from bs4 import BeautifulSoup
url = "http://www.imdb.com/title/tt2866360/reviews?ref_=tt_ov_rt"
ajax_url = url.split('?')[0] + "/_ajax?paginationKey={}"
reviews = []
while True:
    r = requests.get(url)
    soup = BeautifulSoup(r.content, 'html.parser')
    for k in soup.find_all('div', {"class": "review-container"}):
        rating = k.find('span', class_='rating-other-user-rating')
        if rating:
            rating = ''.join(i.text for i in rating.find_all('span')[-2:])
        name = k.find('span', class_='display-name-link').text
        reviews.append([name, rating])
        print(name, rating)
    load_more = soup.find('div', class_='load-more-data')
    if not load_more:
        break
    url = ajax_url.format(load_more['data-key'])
I suggest you print out the content of the <div class="review-container"> of every review to the console, then select the specific data you want to retrieve.
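For example, a minimal way to inspect one review container before deciding which tags to pull (an untested sketch):
import requests
from bs4 import BeautifulSoup

url = "http://www.imdb.com/title/tt2866360/reviews?ref_=tt_ov_rt"
soup = BeautifulSoup(requests.get(url).content, 'html.parser')

# pretty-print the markup of the first review container to see what is available
first = soup.find('div', class_='review-container')
if first:
    print(first.prettify())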