I am extracting review data from IMDB.
However, there is sometimes data without a rank.
I want to treat the Rank as 0 for such data and add it to an array.
I'm not sure how.
Can you help me?
Thank you very much!
When I extract like this, there are fewer Rank values than titles and contents:
Star = []
Title = []
Content = []

for star in soup.select('span:has(~ .point-scale)'):
    Star.append(star.text.strip())

for title in soup.find_all('a', {'class': 'title'}):
    Title.append(title.text.strip())

for content in soup.find_all(True, {'class': ['text show-more__control',
                                              'text show-more__control clickable']}):
    Content.append(content.text.strip())

# the three lists end up with different lengths:
print(len(Star), len(Title), len(Content))
How do the elements on the site fit into matching lists?
Not all reviews will have ratings, so you need to take this into account:
$ python3 test.py https://www.imdb.com/title/tt5113040/reviews
Got response: 200
Title: The Secret Life of Pets 2
# (8/10) Not as bad as some reviews on here
Let's get this straight it a film made for childre...
-----
ddriver385, 26 May 2019
# (7/10) A Good Film for the kids
This film is a good film to watch with the kids. C...
-----
xxharriet_hobbsxx, 27 May 2019
# (7/10) Worth a watch
Admittedly, it probably wasn't necessary to follow...
-----
MythoGenesis, 24 May 2019
# (No rating) Intense and entertaining
Narratively, the film is not without fault. In par...
-----
TheBigSick, 26 May 2019
...
test.py
import requests
import sys
import time

from bs4 import BeautifulSoup


def fetch(url):
    with requests.Session() as s:
        r = s.get(url, timeout=5)
    return r


def main(url):
    start_t = time.time()
    resp = fetch(url)
    print(f'Got response: {resp.status_code}')
    html = resp.content
    bs = BeautifulSoup(html, 'html.parser')

    title = bs.find('h3', attrs={'itemprop': 'name'})
    print(f'Title: {title.a.text}')

    reviews = bs.find_all('div', class_='review-container')
    for review in reviews:
        title = review.find('a', class_='title').text.strip()
        rating = review.find('span', class_='rating-other-user-rating')
        if rating:
            rating = ''.join(i.text for i in rating.find_all('span'))
        rating = rating if rating else 'No rating'
        user = review.find('span', class_='display-name-link').text
        date = review.find('span', class_='review-date').text
        content = review.find('div', class_='content').div.text
        print(
            f'# ({rating}) {title}\n'
            f'{content[:50]}...\n'
            f'{"-" * 5}\n'
            f'{user}, {date}\n'
        )

    end_t = time.time()
    elapsed_t = end_t - start_t
    r_time = resp.elapsed.total_seconds()
    print(f'Total: {elapsed_t:.2f}s, request: {r_time:.2f}s')


if __name__ == '__main__':
    if len(sys.argv) > 1:
        url = sys.argv[1]
        main(url)
    else:
        print('URL is required.')
        sys.exit(1)
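If you want the rank stored as a number, with 0 for missing ratings as asked, here is a minimal sketch built on the same review-container markup as test.py (the parallel list names follow the question; the URL is just the example above):

import requests
from bs4 import BeautifulSoup

html = requests.get('https://www.imdb.com/title/tt5113040/reviews').content
bs = BeautifulSoup(html, 'html.parser')

Rank, Title, Content = [], [], []
for review in bs.find_all('div', class_='review-container'):
    rating = review.find('span', class_='rating-other-user-rating')
    # the first inner <span> holds the numeric value, e.g. '7' of '7/10';
    # a review without that tag gets Rank 0, as requested
    Rank.append(int(rating.find('span').text) if rating else 0)
    Title.append(review.find('a', class_='title').text.strip())
    Content.append(review.find('div', class_='content').div.text.strip())

print(len(Rank), len(Title), len(Content))  # now all three have equal length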
Related
How do I filter out certain skills like 'django' and 'Django' from a collection of skills provided by users through an input form using a Python function?
I've used requests and bs4 to get the raw data, but I need to filter through the results. Here's my code so far:
from bs4 import BeautifulSoup
import requests
import time

unfamiliar_skills = list(map(str, input('>')))

def find_jobs():
    html_text = requests.get('https://www.timesjobs.com/candidate/job-search.html?searchType=personalizedSearch&from=submit&txtKeywords=python&txtLocation=').text
    soup = BeautifulSoup(html_text, 'lxml')
    jobs = soup.find_all('li', class_='clearfix job-bx wht-shd-bx')
    # we first created the parsing for one output, then used a for loop to parse multiple instances of that
    for index, job in enumerate(jobs):
        published_date = job.find('span', class_='sim-posted').span.text  # must come first, to skip scraping if the pub date is not "few days ago"
        if 'few' in published_date:
            company_name = job.find('h3', class_='joblist-comp-name').text.replace(' ', '')
            skills = job.find('span', class_='srp-skills').text.replace(' ', '')
            more_info = job.header.h2.a['href']  # like in a dictionary
            if filter(unfamiliar_skills, skills):
                with open(f'C:/Users/USER/{index}.txt', 'w') as f:
                    f.write(f'Company Name: {company_name.strip()} \n')
                    f.write(f'Required Skills: {skills.strip()} \n')
                    f.write(f'more_info: {more_info} \n')
                print(f'File saved: {index}')

if __name__ == '__main__':
    while True:
        find_jobs()
        time_wait = 10
        print(f'Waiting {time_wait} minutes...')
        time.sleep(time_wait * 60)
Here is the printed output of the skills variable:
rest,python,database,django,debugging,mongodb
python,webtechnologies,linux,mobile,mysql,angularjs,javascript
rest,python,security,debugging
python,docker,messaging,pythonscripting
python,git,django
python,database,django,mysql,api
python,hadoop,machinelearning
rest,python,django,git
python,django,,framework
python,java,scala
python,linux,windows,sql
python,webdeveloper,webservices
rest,python,database,django,api
Python,Django,Flask
python,django,javascript,webprogramming
python,Django,ObjectRelationalMapper
python,webtechnologies,webtechnologies
python,django,html5,javascript
python,django,html5,javascript
None
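For the filtering itself, a minimal sketch of a case-insensitive check (the helper name and the comma-separated input format are assumptions for illustration, not part of the original script):

# assumption: unfamiliar skills are typed comma-separated, e.g. "django,flask"
unfamiliar_skills = {s.strip().lower() for s in input('> ').split(',') if s.strip()}

def has_unfamiliar_skill(skills):
    # lower-casing both sides makes 'django' and 'Django' compare equal
    required = {s.strip().lower() for s in skills.split(',') if s.strip()}
    return bool(required & unfamiliar_skills)

# usage: skip a job when it requires any skill you filtered out
print(has_unfamiliar_skill('rest,python,database,Django'))  # True if 'django' was entered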
So, for some reason, when I try to get the results from this script, it just crashes and shows no error at all before I get anything. Someone please help me get this to work. I don't know why this is; I think it may have to do with the items variable in some regard, but I just can't figure it out! Any help would be appreciated.
Here Is The Script:
from bs4 import BeautifulSoup
import requests
import re
import time

print("Computer Deal Finder")
print("\nBy: ViridianTelamon.")
print("\nThis Program Will Help You Find The Best Computers, Adapters, Electronics, And Computer Components Using The Website New Egg.")

item_thing = input("\nEnter The Item You Want To Find The Best Deals On: ")
time.sleep(2)

#url = f"https://www.amazon.com/s?k={item}&page=1&crid=1BE844NMMQSV7&sprefix={item}%2Caps%2C1923&ref=nb_sb_noss_1"
url = f"https://www.newegg.ca/p/pl?d={item_thing}&N=4131"
page = requests.get(url).text
doc = BeautifulSoup(page, "html.parser")

#page_text = doc.find(class_="s-pagination-item s-pagination-selected")
page_text = doc.find(class_="list-tool-pagination-text").strong
pages = int(str(page_text).split("/")[-2].split(">")[-1][:-1])

items_found = []

for page in range(1, pages + 1):
    url = f"https://www.newegg.ca/p/pl?d={item_thing}&N=4131page={page}"
    page = requests.get(url).text
    doc = BeautifulSoup(page, "html.parser")

    items = doc.find_all(text=re.compile(item_thing))
    #items = div.find_all(text=re.compile(item_thing))
    for item in items:
        parent = item.parent
        link = None
        if parent.name != "a":
            continue
        link = parent['href']

        next_parent = item.find_parent(class_="item-container")
        try:
            price = next_parent.find(class_="price-current").find("strong").string
            items_found[item] = {"Price: ": int(price.replace(",", "")), "URL: ": link}
        except:
            pass

#sorted_items = sorted(items_found.items(), key=lambda x: x[1]['price'])
sorted_items = sorted(items_found, key=lambda x: x[1]['price'])

print("\n--------------------")
for item in sorted_items:
    print("\n"f"Name: {item[0]}")
    print("\n"f"Price: ${items[1]['price']}")
    print("\n"f"URL: items[1]['link']")
    print("\n--------------------")
    time.sleep(0.2)
I suggest you test the result of your .find() calls as not all items contain the information you need. For example:
from bs4 import BeautifulSoup
import requests
import re
import time

item_thing = "adapter"
url = f"https://www.newegg.ca/p/pl?d={item_thing}&N=4131"
page = requests.get(url).text
doc = BeautifulSoup(page, "html.parser")
page_text = doc.find(class_="list-tool-pagination-text").strong
pages = int(str(page_text).split("/")[-2].split(">")[-1][:-1])

items_found = []

for page in range(1, pages + 1):
    print(f"Getting page {page}")
    url = f"https://www.newegg.ca/p/pl?d={item_thing}&N=4131&page={page}"
    req = requests.get(url)
    doc = BeautifulSoup(req.content, "html.parser")

    for div in doc.find_all('div', class_="item-container"):
        li_price = div.find(class_='price-current')
        price = 0  # assume unknown price
        if li_price:
            strong = li_price.find('strong')
            if strong:
                price = float(strong.text.replace(',', ''))
        a_tag = div.find('a', class_='item-title', href=True)
        items_found.append([price, a_tag['href'], a_tag.text])

for price, link, name in sorted(items_found):
    print(f"Name: {name}")
    print(f"Price: ${price}")
    print(f"URL: {link}")
    print("--------------------")
This would give you results starting:
Name: axGear Universal Brass 3.5mm Male to 6.5mm Female Stereo Audio Adapter Jack Connector
Price: $3.0
URL: https://www.newegg.ca/p/231-0099-00023?Description=adapter&cm_re=adapter-_-9SIAD1NC9E3870-_-Product
--------------------
Name: axGear USB-C Female to USB 3.0 Male Adapter Converter Type C to USB 3 F/M
Price: $7.0
URL: https://www.newegg.ca/p/231-0099-00018?Description=adapter&cm_re=adapter-_-9SIAD1NB4E4533-_-Product
--------------------
Name: ORICO USB to Bluetooth 4.0 Portable Adapter Wireless Receiver Adapter Dongle -White
Price: $8.0
URL: https://www.newegg.ca/orico-bta-403/p/0XM-000H-00009?Description=adapter&cm_re=adapter-_-0XM-000H-00009-_-Product
--------------------
The script that I have here finds the rating of a movie from parsed HTML. What if I wanted to, say, find a top-50 list by genre and get the output in JSON format (movie: rating)? How do I get this info in Python: use an API or some other method?
import requests
from bs4 import BeautifulSoup

URL = "https://google.com/search?q={}"

def find_rating(name):
    ratings = {}
    r = requests.get(URL.format(name))
    s = BeautifulSoup(r.text, "html.parser")
    n = s.find_all("div", class_="sDYTm")
    for i in n:
        d = i.text.split(".")
        ratings[d[1]] = d[0]
    return ratings

if __name__ == "__main__":
    movie = "Good Will Hunting"
    rating = find_rating(movie)
    print(rating)
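For the JSON part, the standard-library json module can serialize the returned dict directly; a minimal sketch using find_rating from above:

import json

ratings = find_rating("Good Will Hunting")
# json.dumps turns the {movie: rating} dict into a JSON string
print(json.dumps(ratings, indent=2, ensure_ascii=False))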
I would like to extract at least 20 user reviews for each movie, but I don't know how to loop from the IMDb title page of a movie into its user reviews with BeautifulSoup.
start_link = "https://www.imdb.com/search/title/?title_type=feature,tv_movie&release_date=2018-01-01,2019-12-31&count=250"
title_link_1 = "https://www.imdb.com/title/tt7131622/?ref_=adv_li_tt"
user_reviews_link_movie1 = "https://www.imdb.com/title/tt7131622/reviews?ref_=tt_ov_rt"
From the static list page I am able to extract the titles, years, ratings and metascores of each movie.
# Import packages and set urls
from requests import get

url = 'https://www.imdb.com/search/title/?title_type=feature,tv_movie&release_date=2018-01-01,2019-12-31&count=250'
response = get(url)
print(response.text[:500])

from bs4 import BeautifulSoup
html_soup = BeautifulSoup(response.text, 'html.parser')
type(html_soup)

movie_containers = html_soup.find_all('div', class_='lister-item mode-advanced')
print(type(movie_containers))
print(len(movie_containers))

# Lists to store the scraped data in
names = []
years = []
imdb_ratings = []
metascores = []
votes = []

# Extract data from individual movie container
for container in movie_containers:
    # If the movie has a Metascore, then extract:
    if container.find('div', class_='ratings-metascore') is not None:
        # The name
        name = container.h3.a.text
        names.append(name)
        # The year
        year = container.h3.find('span', class_='lister-item-year').text
        years.append(year)
        # The IMDB rating
        imdb = float(container.strong.text)
        imdb_ratings.append(imdb)
        # The Metascore
        m_score = container.find('span', class_='metascore').text
        metascores.append(int(m_score))

import pandas as pd
test_df = pd.DataFrame({'movie': names, 'year': years, 'imdb': imdb_ratings, 'metascore': metascores})
test_df
Actual results:
movie year imdb metascore
Once Upon a Time... in Hollywood (2019) (8.1) (83)
Scary Stories (2019) (6.5) (61)
Fast & Furious: Hobbs & Shaw (2019) (6.8) (60)
Avengers: Endgame (2019) (8.6) (78)
Expected:
movie1 year1 imbd1 metascore1 review1
movie1 year1 imbd1 metascore1 review2
...
movie1 year1 imbd1 metascore1 review20
movie2 year2 imbd2 metascore2 review1
...
movie2 year2 imbd2 metascore2 review20
...
movie250 year250 imbd250 metascore250 review20
Assuming that the answer to my question in the comments is "yes":
Below is a solution to your initial request.
There's a check whether a particular film really has 20 reviews; if it has fewer, all available ones are gathered.
Technically the parsing process is correct; I checked it with movie_containers = movie_containers[:3] assigned. Gathering all the data will take some time.
UPDATE: I just finished collecting info on all 250 films. Everything is scraped without errors, so the block after the solution itself is just FYI.
Also, if you want to go further with your parsing, i.e. collect data for the next 250 films and so on, you can add one more looping level to this parser (a sketch follows the code below). The process is similar to the one in the "Reviews extracting" section.
# Import packages and set urls
from requests import get
from bs4 import BeautifulSoup
import pandas as pd

base_url = 'https://www.imdb.com/search/title/?title_type=feature,tv_movie&release_date=2018-01-01,2019-12-31&count=250'
url_header_for_reviews = 'https://www.imdb.com'
url_tail_for_reviews = 'reviews?ref_=tt_urv'

base_response = get(base_url)
html_soup = BeautifulSoup(base_response.text, 'html.parser')
movie_containers = html_soup.find_all('div', class_='lister-item mode-advanced')

result_df = pd.DataFrame()

# Extract data from individual movie container
for container in movie_containers:
    # If the movie has a Metascore, then extract:
    if container.find('div', class_='ratings-metascore') is not None:

        # Reviews extracting
        num_reviews = 20
        # Getting the last piece of the link puzzle for a movie's reviews link
        url_middle_for_reviews = container.find('a')['href']
        # Opening the reviews page of a concrete movie
        response_reviews = get(url_header_for_reviews + url_middle_for_reviews + url_tail_for_reviews)
        reviews_soup = BeautifulSoup(response_reviews.text, 'html.parser')
        # Searching all reviews
        reviews_containers = reviews_soup.find_all('div', class_='imdb-user-review')
        # Check if the actual number of reviews is less than the target one
        if len(reviews_containers) < num_reviews:
            num_reviews = len(reviews_containers)
        # Looping through each review and extracting title and body
        reviews_titles = []
        reviews_bodies = []
        for review_index in range(num_reviews):
            review_container = reviews_containers[review_index]
            review_title = review_container.find('a', class_='title').text.strip()
            review_body = review_container.find('div', class_='text').text.strip()
            reviews_titles.append(review_title)
            reviews_bodies.append(review_body)

        # The name
        name = container.h3.a.text
        names = [name for i in range(num_reviews)]
        # The year
        year = container.h3.find('span', class_='lister-item-year').text
        years = [year for i in range(num_reviews)]
        # The IMDB rating
        imdb_rating = float(container.strong.text)
        imdb_ratings = [imdb_rating for i in range(num_reviews)]
        # The Metascore
        metascore = container.find('span', class_='metascore').text
        metascores = [metascore for i in range(num_reviews)]

        # Gathering up scraped data into result_df
        if result_df.empty:
            result_df = pd.DataFrame({'movie': names, 'year': years, 'imdb': imdb_ratings, 'metascore': metascores, 'review_title': reviews_titles, 'review_body': reviews_bodies})
        elif num_reviews > 0:
            result_df = result_df.append(pd.DataFrame({'movie': names, 'year': years, 'imdb': imdb_ratings, 'metascore': metascores, 'review_title': reviews_titles, 'review_body': reviews_bodies}))
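For completeness, the extra looping level mentioned above might look like this (a sketch only; the start offset parameter is an assumption about the old IMDb advanced-search URL scheme, so verify it against the actual pagination links):

from requests import get
from bs4 import BeautifulSoup

base_url = ('https://www.imdb.com/search/title/?title_type=feature,tv_movie'
            '&release_date=2018-01-01,2019-12-31&count=250')

# a sketch: walk several result pages of 250 films each; 'start' is assumed
# to be the 1-based offset the old IMDb advanced search used for paging
for start in (1, 251, 501):
    page_soup = BeautifulSoup(get(f'{base_url}&start={start}').text, 'html.parser')
    containers = page_soup.find_all('div', class_='lister-item mode-advanced')
    print(f'start={start}: {len(containers)} containers')
    # run the same per-container extraction as above on `containers`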
By the way, I'm not sure that IMDB will let you gather data for all films in a loop as is. There's a possibility that you'll get a captcha or a redirection to some other page. If these issues appear, I'd go with a simple solution: pauses in scraping and/or changing user-agents.
Pause (sleep) can be implemented as follows:
import time
import numpy as np
time.sleep((30-5)*np.random.random()+5) #from 5 to 30 seconds
Inserting a user-agent in request can be done as follows:
import requests
from bs4 import BeautifulSoup
url = ('http://www.link_you_want_to_make_request_on.com/bla_bla')
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, 'html.parser')
Google some other variants of user-agents, make a list of them, and change them from time to time across requests, as in the sketch below. Watch out which user-agents you use, though: some of them indicate mobile or tablet devices, and for those a site (not only IMDB) can serve response pages in a format that differs from the PC one (other markup, other design, etc.). So in general the above algorithm works only for the PC version of pages.
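A minimal sketch of that rotation (the agent strings are examples; swap in whichever desktop user-agents you collect):

import random
import requests

# example desktop user-agent strings; extend this list with ones you collect
USER_AGENTS = [
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:68.0) '
    'Gecko/20100101 Firefox/68.0',
]

def get_with_random_agent(url):
    # pick a different user-agent per request to look less like a bot
    headers = {'User-Agent': random.choice(USER_AGENTS)}
    return requests.get(url, headers=headers)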
I have written code in Python using Beautiful Soup to extract user names and their ratings from IMDB. But there are many users who did not give a rating with their reviews, which makes it difficult to map the ratings exactly to the reviews. How can I do this part?
http://www.imdb.com/title/tt2866360/reviews?ref_=tt_ov_rt
At this URL, some reviews are not assigned a rating.
url1 = "http://www.imdb.com/title/tt2866360/reviews?ref_=tt_ov_rt"
response = requests.get(url1, headers=headers)
page = response.content
soup = BeautifulSoup(page)
for k in soup.findAll('div', {"class": "load-more-data"}):
    if k.name == 'span' and m['class'] == "rating-other-user-rating":
        print blah()
    else:
        print blah1()
This is the code to check whether the rating part exists in the review part or not, but it doesn't return anything.
The information you're looking for (username, rating) is located in 'div.review-container' tags. As for the tags that have no rating, you can just ignore them:
for k in soup.find_all('div', {"class": "review-container"}):
    rating = k.find('span', class_='rating-other-user-rating')
    if rating:
        rating = ''.join(i.text for i in rating.find_all('span')[-2:])
    name = k.find('span', class_='display-name-link').text
    print name, rating
The information that shows when you press the Load More button is loaded via XHR requests.
You'll find all the data you need in order to perform the request in a 'div.load-more-data' tag.
load_more = soup.find('div', class_='load-more-data')
url = 'http://www.imdb.com{}?paginationKey={}'.format(
    load_more['data-ajaxurl'], load_more['data-key']
)
r = requests.get(url)
soup = BeautifulSoup(r.content, 'html.parser')
Just repeat the above process until you have all the info.
import requests
from bs4 import BeautifulSoup

url = "http://www.imdb.com/title/tt2866360/reviews?ref_=tt_ov_rt"
ajax_url = url.split('?')[0] + "/_ajax?paginationKey={}"
reviews = []

while True:
    r = requests.get(url)
    soup = BeautifulSoup(r.content, 'html.parser')
    for k in soup.find_all('div', {"class": "review-container"}):
        rating = k.find('span', class_='rating-other-user-rating')
        if rating:
            rating = ''.join(i.text for i in rating.find_all('span')[-2:])
        name = k.find('span', class_='display-name-link').text
        reviews.append([name, rating])
        print name, rating
    load_more = soup.find('div', class_='load-more-data')
    if not load_more:
        break
    url = ajax_url.format(load_more['data-key'])
I suggest you inspect the content of the <div class="review-container" ... of every review first, then select the specific data you want to retrieve.
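A minimal sketch of that inspection step (using the soup object from any of the pages above):

containers = soup.find_all('div', class_='review-container')
if containers:
    # dump the first container's markup to see which tags hold which data
    print(containers[0].prettify())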