I want to extract IMDb movie IDs using Python

So I wanted to extract all the Bollywood movies, and the project requires movie titles, cast, crew, IMDb IDs, etc. I am not able to get all the IMDb IDs; I get a NoneType error. When I used the code on one page only it worked quite well, but when I use it on multiple pages it shows an error. Please help.
Here is my code:
#importing the libraries needed
import pandas as pd
import numpy as np
import requests
import re
from bs4 import BeautifulSoup
from time import sleep
from random import randint

#declaring the empty lists, so that we can append the data as we go
movie_name = []
year = []
time = []
rating = []
votes = []
description = []
director_s = []
starList = []
imdb_id = []

#the whole core of the script
url = "https://www.imdb.com/search/title/?title_type=feature&primary_language=hi&sort=num_votes,desc&start=1&ref_=adv_nxt"
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html.parser')
movie_data = soup.findAll('div', attrs = {'class': 'lister-item mode-advanced'})

for store in movie_data:
    name = store.h3.a.text
    movie_name.append(name)
    year_of_release = store.h3.find('span', class_ = "lister-item-year text-muted unbold").text
    year.append(year_of_release)
    runtime = store.p.find("span", class_ = 'runtime').text if store.p.find("span", class_ = 'runtime') else " "
    time.append(runtime)
    rate = store.find('div', class_ = "inline-block ratings-imdb-rating").text.replace('\n', '') if store.find('div', class_ = "inline-block ratings-imdb-rating") else " "
    rating.append(rate)
    value = store.find_all('span', attrs = {'name': "nv"})
    vote = value[0].text if value else " "
    votes.append(vote)
    # Description of the movie
    describe = store.find_all('p', class_ = 'text-muted')
    description_ = describe[1].text.replace('\n', '') if len(describe) > 1 else ' '
    description.append(description_)
    ## Director
    ps = store.find_all('p')
    for p in ps:
        if 'Director' in p.text:
            director = p.find('a').text
            director_s.append(director)
    ## ID
    imdbID = store.find('span', 'rating-cancel').a['href'].split('/')[2]
    imdb_id.append(imdbID)
    ## actors
    star = store.find("p", attrs = {"class": ""}).text.replace("Stars:", "").replace("\n", "").replace("Director:", "").strip()
    starList.append(star)
Error:
AttributeError Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_17576/2711511120.py in <module>
63
64 ## IDs
---> 65 imdbID = store.find('span','rating-cancel').a['href'].split('/')[2] if store.find('span','rating-cancel').a['href'].split('/')[2] else ' '
66 imdb_id.append(imdbID)
67
AttributeError: 'NoneType' object has no attribute 'a'

Change your condition to the following, because first you have to check whether the <span> exists:
imdbID = store.find('span','rating-cancel').a.get('href').split('/')[2] if store.find('span','rating-cancel') else ' '
Example
Check the URL; some of the <span> elements are missing:
import requests
from bs4 import BeautifulSoup
#the whole core of the script
url = "https://www.imdb.com/search/title/?title_type=feature&primary_language=hi&sort=my_ratings,desc"
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html.parser')
movie_data = soup.find_all('div', attrs = {'class': 'lister-item mode-advanced'})
for store in movie_data:
    imdbID = store.find('span','rating-cancel').a.get('href').split('/')[2] if store.find('span','rating-cancel') else ' '
    print(imdbID)
Output
tt9900050
tt9896506
tt9861220
tt9810436
tt9766310
tt9766294
tt9725058
tt9700334
tt9680166
tt9602804
Even better, scrape the ID via the image tag, because it is always there, even if only the placeholder is shown:
imdbID = store.img.get('data-tconst')
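For the multi-page case in the original question, here is a minimal sketch building on that data-tconst approach; it assumes IMDb still serves this lister markup and paginates via the start query parameter in steps of 50:
import requests
from bs4 import BeautifulSoup
from time import sleep
from random import randint

imdb_id = []
for start in range(1, 151, 50): # first three result pages, as an example
    url = ("https://www.imdb.com/search/title/?title_type=feature"
           "&primary_language=hi&sort=num_votes,desc&start={}&ref_=adv_nxt").format(start)
    page = requests.get(url)
    soup = BeautifulSoup(page.text, 'html.parser')
    for store in soup.find_all('div', class_ = 'lister-item mode-advanced'):
        # img carries data-tconst even when only the placeholder poster is shown
        imdb_id.append(store.img.get('data-tconst'))
    sleep(randint(1, 3)) # be polite between page requests
print(len(imdb_id), imdb_id[:5])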

Related

Filter strings scraped from input form in Python

How do I filter out certain skills like 'django' and 'Django' from a collection of skills provided by users through an input form using a Python function?
I've used requests and bs4 to get the raw data, but I need to filter through the results. Here's my code so far:
from bs4 import BeautifulSoup
import requests
import time
unfamiliar_skills = list(map(str,input('>')))
def find_jobs():
    html_text = requests.get('https://www.timesjobs.com/candidate/job-search.html?searchType=personalizedSearch&from=submit&txtKeywords=python&txtLocation=').text
    soup = BeautifulSoup(html_text, 'lxml')
    jobs = soup.find_all('li', class_ = 'clearfix job-bx wht-shd-bx')
    # we first created the parsing for one output, then used a for loop to parse multiple instances of it.
    for index, job in enumerate(jobs):
        published_date = job.find('span', class_ = 'sim-posted').span.text # must be checked first, to skip jobs whose publication date is not "few days ago"
        if 'few' in published_date:
            company_name = job.find('h3', class_ = 'joblist-comp-name').text.replace(' ', '')
            skills = job.find('span', class_ = 'srp-skills').text.replace(' ', '')
            more_info = job.header.h2.a['href'] # accessed like in a dictionary
            if filter(unfamiliar_skills, skills):
                with open(f'C:/Users/USER/{index}.txt', 'w') as f:
                    f.write(f'Company Name: {company_name.strip()} \n')
                    f.write(f'Required Skills: {skills.strip()} \n')
                    f.write(f'more_info: {more_info} \n')
                print(f'File saved: {index}')

if __name__ == '__main__':
    while True:
        find_jobs()
        time_wait = 10
        print(f'Waiting {time_wait} minutes...')
        time.sleep(time_wait * 60)
Here is the printed output of the skills variable:
rest,python,database,django,debugging,mongodb
python,webtechnologies,linux,mobile,mysql,angularjs,javascript
rest,python,security,debugging
python,docker,messaging,pythonscripting
python,git,django
python,database,django,mysql,api
python,hadoop,machinelearning
rest,python,django,git
python,django,,framework
python,java,scala
python,linux,windows,sql
python,webdeveloper,webservices
rest,python,database,django,api
Python,Django,Flask
python,django,javascript,webprogramming
python,Django,ObjectRelationalMapper
python,webtechnologies,webtechnologies
python,django,html5,javascript
python,django,html5,javascript
None
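No answer is shown for this one, but note that filter(unfamiliar_skills, skills) is always truthy, because a filter object is never falsy, so the condition in the question never skips anything. A minimal sketch of the filtering itself, assuming the goal is to skip any job that lists one of the unfamiliar skills regardless of case (the helper name is_familiar and the comma-splitting are assumptions, not from the question):
def is_familiar(skills, unfamiliar_skills):
    # skills is the comma-joined string scraped above, e.g. 'rest,python,django'
    job_skills = {s.strip().lower() for s in skills.split(',') if s.strip()}
    unfamiliar = {s.strip().lower() for s in unfamiliar_skills}
    return job_skills.isdisjoint(unfamiliar)

# usage inside find_jobs(), replacing the filter(...) call:
# if is_familiar(skills, ['django']):
#     ... write the file ...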

Extracting URL From Span Element Without href

I am attempting to extract links from a website that does not use an href. I have tried multiple iterations of trying to find the tag associated with the URL, which from what I can gather sits between <span> elements.
import requests
from bs4 import BeautifulSoup
url = 'https://www.flavortownusa.com/locations'
page = requests.get(url)
f = open("test12.csv", "w")
soup = BeautifulSoup(page.content, 'html.parser')
lists = soup.find_all('div', class_ = 'listing-item-inner')
for list in lists:
    title = list.find('span', class_ = '$0')
    webs = list.find('#text', class_ = 'fa-fa.link')
    address = list.find('ul', class_ = 'post-meta')
    temp = list.find('span', class_ = 'text')
    temp2 = list.find('i', class_ = '(text)')
    info = [title, webs, address, temp, temp2]
    f.write(str(info))
    f.write("\n")
    print(info)
The desired output is to extract the data from the <span></span> elements: the address (345 40th Ave N), plus the URL and phone number that sit below <i class='fa fa-link'> and <i class='fa fa-phone'>, with the three elements written to a CSV file.
You could call the next element, e.find(class_ = 'fa-link').next, after selecting the <i> with class fa-link:
for e in lists:
    print(e.find(class_ = 'fa-link').next.strip() if e.find(class_ = 'fa-link') else '')
Note: do not shadow built-in names like list, and always check whether the element you are searching for is available.
Example
import requests
from bs4 import BeautifulSoup

url = 'https://www.flavortownusa.com/locations'
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')
with open('somefile.csv', 'a', encoding='utf-8') as f:
    for e in soup.find_all('div', class_ = 'listing-item-inner'):
        title = e.h3.text
        webs = e.select_one('.fa-link').next if e.select_one('.fa-link') else ''
        address = e.span.text
        phone = e.select_one('.fa-phone').next if e.select_one('.fa-phone') else ''
        f.write(','.join([title, webs, address, phone]) + '\n')
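One caveat about the ','.join(...) line: if a title or address itself contains a comma, the row breaks. A variant using the standard csv module (an addition, not part of the original answer) quotes such fields automatically:
import csv
import requests
from bs4 import BeautifulSoup

url = 'https://www.flavortownusa.com/locations'
soup = BeautifulSoup(requests.get(url).content, 'html.parser')
with open('somefile.csv', 'a', encoding='utf-8', newline='') as f:
    writer = csv.writer(f)
    for e in soup.find_all('div', class_ = 'listing-item-inner'):
        title = e.h3.text
        webs = e.select_one('.fa-link').next if e.select_one('.fa-link') else ''
        address = e.span.text
        phone = e.select_one('.fa-phone').next if e.select_one('.fa-phone') else ''
        writer.writerow([title, webs, address, phone]) # fields containing commas get quoted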

how to scrape texts from votesmart via beautifulsoup

I am trying to scrape some statements made by U.S. politicians on votesmart.org.
I am getting errors in extracting the texts, even though the code runs.
The code that I am using is as follows:
from bs4 import BeautifulSoup
from time import sleep
import pandas as pd
import requests
import os
def main():
    df = pd.read_csv('https://theunitedstates.io/congress-legislators/legislators-current.csv')
    df = df[df.type == 'sen']
    df = df[~df.votesmart_id.isna()]
    done_list = os.listdir('corpus')
    print("{} senators".format(len(df)))
    df = df[~df.full_name.isin(done_list)]
    print("{} after some already done".format(len(df)))
    df = df.sample(frac=1)
    df.apply(scrape_politician_speeches, axis=1)

def scrape_politician_speeches(row):
    print('Scraping {}...'.format(row.full_name))
    vs_url = 'https://justfacts.votesmart.org/candidate/public-statements/{}'.format(int(row.votesmart_id))
    vs_page = requests.get(vs_url) # fill in the last part of the url
    soup = BeautifulSoup(vs_page.content, features="lxml")
    n_pages = 1
    page_num = 1
    while page_num <= n_pages:
        print("\tPage {} of {}".format(page_num, n_pages))
        #speeches_url = vs_page.url + '?start=2019-01-01&speechType=14&p={}'.format(page_num)
        speeches_url = vs_page.url + '/?s=date&start=2020/01/01&end=&p={}'.format(page_num)
        speeches_page = requests.get(speeches_url)
        soup = BeautifulSoup(speeches_page.content, features="lxml")
        speech_table = soup.find('table', {'id': 'statementsObjectsTables'})
        speech_table = soup.find('tbody')
        speech_links = speech_table.find_all('a', href=True)
        speech_hrefs = [a.get('href') for a in speech_links]
        for href in speech_hrefs:
            scrape_speech(person=row.full_name, speech_url=href)
        try:
            n_pages = int(soup.find('h7').text.split()[-1])
        except:
            print("\tNo page numbers")
        page_num += 1
        sleep(1)

def scrape_speech(person, speech_url):
    try:
        if not os.path.isdir('corpus/{}'.format(person)):
            os.mkdir('corpus/{}'.format(person))
        speech_page = requests.get(speech_url)
        soup = BeautifulSoup(speech_page.content, features="lxml")
        title = soup.find('h3').text
        date = soup.find('span', {'itemprop': 'datePublished'}).text
        location = soup.find('span', {'itemprop': 'contentLocation'}).text
        body = soup.find('div', {'class': "main clear"})
        p_list = body.find_all('p')
        text_list = [p.text for p in p_list]
        speech_text = '\n\n'.join(text_list)
        full_text = '{}\n\n\n{}'.format(title, speech_text)
        file_name = '{}, {}, {}.txt'.format(title.split(',')[0], date, location)
        file_name = file_name.replace('/', ' ')
        with open('corpus/{}/{}'.format(person, file_name), 'w') as f:
            f.write(full_text)
    except:
        print("\tError with {}".format(speech_url))

if __name__ == '__main__':
    main()
The errors look like this:
95 senators
95 after some already done
Scraping Tammy Duckworth...
Page 1 of 1
Error with https://votesmart.org/public-statement/1570841/durbin-duckworth-announce-135-million-for-springfield-rail-improvement-project
Error with https://votesmart.org/public-statement/1570825/durbin-duckworth-statement-on-nomination-of-ladon-reynolds-to-serve-as-us-marshal-for-the-northern-district-of-illinois
Error with https://votesmart.org/public-statement/1570826/durbin-duckworth-announce-16-million-in-telehealth-funding-for-illinois-health-care-providers
Thank you so much for your time and attention. I hope to learn more from this wonderful community.
scrape_speech is outdated; the pages' design has probably changed since the script was written. There is no <div class="main clear"> in the HTML, there is no <span itemprop="datePublished">, and so on. You need to rewrite it using current CSS selectors.
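A minimal defensive sketch of what a rewritten scrape_speech could look like; the selectors here are placeholders, to be swapped in after inspecting the current page in the browser's dev tools:
import requests
from bs4 import BeautifulSoup

def scrape_speech(speech_url):
    soup = BeautifulSoup(requests.get(speech_url).content, features="lxml")
    title_tag = soup.find('h3') # placeholder selector, verify on the live page
    body_tag = soup.select_one('div.main') # placeholder selector, verify on the live page
    if title_tag is None or body_tag is None:
        print("\tSelectors out of date for {}".format(speech_url))
        return None
    speech_text = '\n\n'.join(p.text for p in body_tag.find_all('p'))
    return '{}\n\n\n{}'.format(title_tag.text, speech_text)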

How do I extract text from bs4 tag elements in my code? Using contents function doesn't work

I am getting the error below when calling text.strip():
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-278-135ac185ec3f> in <module>
20 if isinstance(b, Tag):
21
---> 22 location = [a.text.strip() for a in b.find('span', attrs = {'class': 'location'})]
23 job_title = [a.text.strip() for a in b.find('a', attrs = {'data-tn-element':'jobTitle'})]
24
TypeError: 'NoneType' object is not iterable
Please see below for my code:
import requests
from bs4 import BeautifulSoup, NavigableString, Tag, Comment
import pandas as pd
df = pd.DataFrame(columns=["location", 'company', 'job_title', 'salary'])
for start in range(1, 100, 10):
    url = 'https://www.indeed.com/jobs?q=python+sql&l=San+Francisco&start={}'
    #format url above to request the various search pages
    new_url = url.format(start)
    #conducting a request of the stated URL above:
    page = requests.get(new_url)
    #specifying a desired format of "page" using the html parser - this allows python to read the various components of the page, rather than treating it as one long string.
    soup = BeautifulSoup(page.text, 'html.parser')
    #loop through the tag elements
    for b in soup.find_all(name = 'div', attrs = {'class': 'jobsearch-SerpJobCard'}):
        print(type(b))
        if isinstance(b, NavigableString):
            continue
        if isinstance(b, Tag):
            location = [a.text.strip() for a in b.find('span', attrs = {'class': 'location'})]
            job_title = [a.text.strip() for a in b.find('a', attrs = {'data-tn-element': 'jobTitle'})]
            try:
                company = [a.text.strip() for a in b.find('span', attrs = {'class': 'company'})]
            except:
                company = 'NA'
            try:
                salary = [a.text.strip() for a in b.find('span', attrs = {'class': 'salaryText'}).find('nobr')]
            except:
                salary = 'NA'
            df = df.append({"location": location, "company": company, "job_title": job_title, "salary": salary}, ignore_index=True)
It isn't found because there is no <span> on the page with the class attribute set to 'location'. There are <div>s that have a class attribute set to 'location'.
Here is my modified version. It still isn't perfect, as some locations aren't grabbed. One idea is to simply skip the entries that don't have a job or location, if those two fields are required; you could accomplish this by replacing the except action of assigning 'NA' with continue.
import requests
from bs4 import BeautifulSoup, NavigableString, Tag, Comment
import pandas as pd
df = pd.DataFrame(columns=["location", 'company', 'job_title', 'salary'])
for start in range(1, 100, 10):
    url = 'https://www.indeed.com/jobs?q=python+sql&l=San+Francisco&start={}'
    #format url above to request the various search pages
    new_url = url.format(start)
    #conducting a request of the stated URL above:
    page = requests.get(new_url)
    #specifying a desired format of "page" using the html parser - this allows python to read the various components of the page, rather than treating it as one long string.
    soup = BeautifulSoup(page.text, 'html.parser')
    #loop through the tag elements
    for b in soup.find_all(name = 'div', attrs = {'class': 'jobsearch-SerpJobCard'}):
        print(type(b))
        if isinstance(b, NavigableString):
            continue
        if isinstance(b, Tag):
            try:
                location = [a.strip() for a in b.find('div', attrs = {'class': 'location'})]
            except TypeError:
                location = 'NA'
            try:
                job_title = [a.strip() for a in b.find('a', attrs = {'data-tn-element': 'jobTitle'})]
            except TypeError:
                job_title = 'NA'
            try:
                company = [a.text.strip() for a in b.find('span', attrs = {'class': 'company'})]
            except:
                company = 'NA'
            try:
                salary = [a.text.strip() for a in b.find('span', attrs = {'class': 'salaryText'}).find('nobr')]
            except:
                salary = 'NA'
            df = df.append({"location": location, "company": company, "job_title": job_title, "salary": salary}, ignore_index=True)
You will need to add a check for None values; find returns None if no element is found. Note that a trailing if a inside the comprehension does not guard the iteration itself, so check the tag before iterating:
span = b.find('span', attrs = {'class': 'location'})
location = [a.text.strip() for a in span] if span else 'NA'
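The try/except blocks above can also be collapsed with a small helper (an addition, not from the answers) that wraps the find-and-strip pattern:
def text_or_na(parent, name, attrs):
    # returns the stripped text of the first match, or 'NA' when find() returns None
    tag = parent.find(name, attrs=attrs)
    return tag.text.strip() if tag else 'NA'

# usage inside the loop over job cards:
# location = text_or_na(b, 'div', {'class': 'location'})
# company = text_or_na(b, 'span', {'class': 'company'})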

How to extract text within h4 strong?

I am trying to extract each "Overall Rating" (number value in strong tags) from each product page
https://www.guitarguitar.co.uk/product/12082017334688--epiphone-les-paul-standard-plus-top-pro-translucent-blue
The structure goes as follows:
<div class="col-sm-12">
    <h2 class="line-bottom"> Customer Reviews</h2>
    <h4>
        Overall Rating
        <strong>5</strong>
        <span></span>
    </h4>
</div>
I am trying to extract only the strong values.
productsRating = soup.find("div", {"class": "col-sm-12"}).h4
This sometimes works, but the page uses the same class for different elements, so it extracts unwanted html elements.
Is there any solution to only getting the products overall reviews?
EDITED!!
This is the whole loop for my program.
for page in range(1, 2):
    guitarPage = requests.get('https://www.guitarguitar.co.uk/guitars/electric/page-{}'.format(page)).text
    soup = BeautifulSoup(guitarPage, 'lxml')
    guitars = soup.find_all(class_ = 'col-xs-6 col-sm-4 col-md-4 col-lg-3')
    for guitar in guitars:
        title_text = guitar.h3.text.strip()
        print('Guitar Name: ', title_text)
        price = guitar.find(class_ = 'price bold small').text.strip()
        trim = re.compile(r'[^\d.,]+')
        int_price = trim.sub('', price)
        print('Guitar Price: ', int_price)
        priceSave = guitar.find('span', {'class': 'price save'})
        if priceSave is not None:
            priceOf = priceSave.text
            trim = re.compile(r'[^\d.,]+')
            int_priceOff = trim.sub('', priceOf)
            print('Save: ', int_priceOff)
        else:
            print("No discount!")
        image = guitar.img.get('src')
        print('Guitar Image: ', image)
        productLink = guitar.find('a').get('href')
        linkProd = url + productLink
        print('Link of product', linkProd)
        productsPage.append(linkProd)

for products in productsPage:
    response = requests.get(products)
    soup = BeautifulSoup(response.content, "lxml")
    productsDetails = soup.find("div", {"class": "description-preview"})
    if productsDetails is not None:
        description = productsDetails.text
        print('product detail: ', description)
    else:
        print('none')
    time.sleep(0.2)
    productsRating = soup.find_all('strong')[0].text
    print(productsRating)
Review info is all in a script tag, which you can extract and load with json. It is simple enough to see how to fit that into a loop.
import requests
from bs4 import BeautifulSoup as bs
import json
url = 'https://www.guitarguitar.co.uk/product/12082017334688--epiphone-les-paul-standard-plus-top-pro-translucent-blue'
r = requests.get(url)
soup = bs(r.content, 'lxml')
script = soup.select_one('[type="application/ld+json"]').text
data = json.loads(script.strip())
overall_rating = data['@graph'][2]['aggregateRating']['ratingValue']
reviews = [review for review in data['@graph'][2]['review']] #extract what you want
To handle no reviews you could use a simple try/except:
import requests
from bs4 import BeautifulSoup as bs
import json
url = 'https://www.guitarguitar.co.uk/product/190319340849008--gibson-les-paul-standard-60s-iced-tea'
r = requests.get(url)
soup = bs(r.content, 'lxml')
script = soup.select_one('[type="application/ld+json"]').text
data = json.loads(script.strip())
try:
    overall_rating = data['@graph'][2]['aggregateRating']['ratingValue']
    reviews = [review for review in data['@graph'][2]['review']] #extract what you want
except: #you might want to use except KeyError
    overall_rating = "None"
    reviews = ['None']
or, use an if statement:
if 'aggregateRating' in script:
    overall_rating = data['@graph'][2]['aggregateRating']['ratingValue']
    reviews = [review for review in data['@graph'][2]['review']] #extract what you want
else:
    overall_rating = "None"
    reviews = ['None']
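Hard-coding index 2 into data['@graph'] is brittle if the site ever reorders the graph; a small variant (an assumption, not part of the original answer) searches for whichever node carries aggregateRating:
product = next((node for node in data.get('@graph', [])
                if isinstance(node, dict) and 'aggregateRating' in node), None)
overall_rating = product['aggregateRating']['ratingValue'] if product else "None"
reviews = product.get('review', []) if product else ['None']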
Try:
import requests
from bs4 import BeautifulSoup
url = 'https://www.guitarguitar.co.uk/product/190319340849008--gibson-les-paul-standard-60s-iced-tea'
html = requests.get(url).text
soup = BeautifulSoup(html, "lxml")
try:
    productsRating = soup.find('h2', string=lambda s: s and "Customer Reviews" in s).find_next_siblings()[0].find('strong').text
except:
    productsRating = None
print(productsRating)
