Get movie information from IMDb API website - python

I used scrapy spiders to crawl the IMDb ID from the IMDb website already.
So now, I am going to use the IMDb API website & the IMDb ID I have collected to build a dictionary and save it into a json file.
import requests
import json
def query_url(id):
    """Build the OMDb API query URL for a single IMDb ID.

    Args:
        id: an IMDb identifier string such as 'tt0407887'.

    Returns:
        The OMDb query URL requesting the short plot in JSON format.
    """
    # BUG FIX: the assignment was commented out, so the function returned
    # the function object itself instead of a URL string.
    query_url = 'http://www.omdbapi.com/?i=' + id + '&plot=short&r=json'
    return query_url
def get_movie_ids(input_file):
    """Read IMDb IDs, one per line, from *input_file*.

    Args:
        input_file: path to a text file with one IMDb ID per line.

    Returns:
        A list of stripped ID strings, e.g. ['tt0407887', 'tt1212123', ...].
    """
    # BUG FIX: the whole body was commented out, so `id_list` was an
    # undefined name and the function raised NameError when called.
    id_list = []
    with open(input_file, 'r') as f:
        for line in f:
            id_list.append(line.strip())  # sth like ['tt0407887', 'tt1212123', ... ]
    return id_list
def get_all_data(in_file, out_file):
    """Fetch OMDb data for every IMDb ID listed in *in_file* and dump the
    collected records as one JSON object to *out_file*.

    Keys of the output object are sequential integer counters; values are
    the JSON documents returned by the OMDb API.
    """
    movie_data_dict = {}
    movie_ids = get_movie_ids(in_file)
    id_counter = 0
    session = requests.Session()  # reuse one connection for all requests
    for id in movie_ids:
        url = query_url(id)
        try:
            movie_data = session.get(url).json()  # to catch corrupted json file
        except ValueError:
            # BUG FIX: the original `pass` fell through and stored a stale
            # (or undefined) `movie_data`; skip this ID instead.
            continue
        movie_data_dict[id_counter] = movie_data
        id_counter += 1
    with open(out_file, 'w+') as f:
        json.dump(movie_data_dict, f)
if __name__ == '__main__':
    # Input file: one IMDb ID per line, produced earlier by the Scrapy spider.
    movie_id_file = r'../IMDbIDCrawler/movie_id10-15' # the IMDb ID crawled by Scrapy
    # Output file: all fetched OMDb records as a single JSON object.
    movie_data_file = 'IMDb2010-2015.json'
    get_all_data(movie_id_file, movie_data_file)
When I run the code in the command prompt, it simply won't run.
I don't know what I have done wrong in the above script.
This is Python homework. The lines marked with # were written by me, while the other code was provided in the first place. The pace of the course is very fast and I can't keep up, and I have no one to ask. So please forgive me if I ask some really basic beginner questions.

Related

How do I download videos from Pexels API?

I have this code that can pull images off of Pexels, but I don't know how to change it to video. I haven't seen anyone do this before and any help greatly appreciated. I tried switching all the photo tags to videos but that seemed not to work. I've also tried adding more libraries but that doesn't seem to work either.
import argparse
import json
import os
import time
import requests
import tqdm
from pexels_api import API
# API key must be supplied via the PEXELS_KEY environment variable;
# this raises KeyError at import time if it is not set.
PEXELS_API_KEY = os.environ['PEXELS_KEY']
MAX_IMAGES_PER_QUERY = 100  # hard cap on results fetched per query
RESULTS_PER_PAGE = 10       # page size requested from the API
# NOTE(review): true division yields a float (10.0); the `page <= PAGE_LIMIT`
# comparison still works, but integer division (//) would be cleaner.
PAGE_LIMIT = MAX_IMAGES_PER_QUERY / RESULTS_PER_PAGE
def get_sleep(t):
    """Build a delay callable: each call blocks for ``t`` seconds and
    returns None."""
    return lambda: time.sleep(t)
def main(args):
    """Search Pexels for ``args.query`` photos and download them to ``args.path``.

    Step 1 pages through the search API collecting per-photo metadata;
    step 2 saves that metadata as <query>.json and downloads each image.
    """
    sleep = get_sleep(args.sleep)  # throttle between successive API pages
    api = API(PEXELS_API_KEY)
    query = args.query
    page = 1
    counter = 0
    photos_dict = {}
    # Step 1: Getting urls and meta information
    while page <= PAGE_LIMIT:
        api.search(query, page=page, results_per_page=RESULTS_PER_PAGE)
        photos = api.get_entries()
        for photo in tqdm.tqdm(photos):
            # Reach into the wrapper's name-mangled private attribute to get
            # the raw JSON dict for this photo (contains the 'src' variants).
            photos_dict[photo.id] = vars(photo)['_Photo__photo']
            counter += 1
        if not api.has_next_page:
            break
        page += 1
        sleep()
    print(f"Finishing at page: {page}")
    print(f"Images were processed: {counter}")
    # Step 2: Downloading
    if photos_dict:
        os.makedirs(args.path, exist_ok=True)
        # Saving dict
        with open(os.path.join(args.path, f'{query}.json'), 'w') as fout:
            json.dump(photos_dict, fout)
        for val in tqdm.tqdm(photos_dict.values()):
            url = val['src'][args.resolution]
            # Local filename is taken from the original-resolution URL.
            fname = os.path.basename(val['src']['original'])
            image_path = os.path.join(args.path, fname)
            if not os.path.isfile(image_path): # ignore if already downloaded
                response = requests.get(url, stream=True)
                with open(image_path, 'wb') as outfile:
                    outfile.write(response.content)
            else:
                print(f"File exists: {image_path}")
if __name__ == '__main__':
    # CLI: --query is required; everything else has a sensible default.
    parser = argparse.ArgumentParser()
    parser.add_argument('--query', type=str, required=True)
    parser.add_argument('--path', type=str, default='./results_pexels')
    # Which 'src' size variant of each photo to download.
    parser.add_argument('--resolution', choices=['original', 'large2x', 'large',
                                                 'medium', 'small', 'portrait',
                                                 'landscape', 'tiny'], default='original')
    parser.add_argument('--sleep', type=float, default=0.1)  # seconds between API pages
    args = parser.parse_args()
    main(args)
sorry for bumping into the question. I just faced a similar situation when downloading the videos from Pexels using the python API, pexelsPy. This may be helpful:
I retrieved the ID of the videos and then created the downloading URL that has the following structure: "https://www.pexels.com/video/"+ ID +"/download".
See the following example:
def download_video(type_of_videos):
    """Pick a random tag from *type_of_videos*, search Pexels videos for it,
    and download the first landscape video not already downloaded.

    Returns the local .mp4 filename of the downloaded video.
    NOTE(review): if no new landscape video is ever found, the loop never
    terminates -- `video_found_flag` is never set to False.
    """
    video_tag = random.choice(type_of_videos)
    PEXELS_API = '-' #please add your API Key here
    api = API(PEXELS_API)
    # URLs downloaded on previous runs, used to skip duplicates.
    retrieved_videos = read_already_download_files('downloaded_files.txt')
    video_found_flag = True
    num_page = 1
    while video_found_flag:
        api.search_videos(video_tag, page=num_page, results_per_page=10)
        videos = api.get_videos()
        for data in videos:
            if data.width > data.height: #look for horizontal orientation videos
                if data.url not in retrieved_videos:
                    # write_file('downloaded_files.txt', data.url)
                    url_video = 'https://www.pexels.com/video/' + str(data.id) + '/download' #create the url with the video id
                    r = requests.get(url_video)
                    # The video's URL slug becomes the local filename.
                    with open(data.url.split('/')[-2]+'.mp4', 'wb') as outfile:
                        outfile.write(r.content)
                    return data.url.split('/')[-2]+'.mp4' #download the video
        num_page += 1
download_video function takes an array of strings with several tags, e.g.: ['happy','sad','relax']. Then it randomly chooses one of these tags.
PEXELS_API should contain your API Key.
read_already_download_files('downloaded_files.txt'): Retrieves already downloaded files to check if the current found file is already downloaded.
from pypexels import PyPexels
import requests

api_key = 'api id'  # replace with your own Pexels API key
# instantiate PyPexels object
py_pexel = PyPexels(api_key=api_key)
search_videos_page = py_pexel.videos_search(query="love", per_page=40)
# while True:
for video in search_videos_page.entries:
    print(video.id, video.user.get('name'), video.url)
    # Pexels serves the raw file from the /video/<id>/download endpoint.
    data_url = 'https://www.pexels.com/video/' + str(video.id) + '/download'
    r = requests.get(data_url)
    print(r.headers.get('content-type'))
    with open('sample.mp4', 'wb') as outfile:
        outfile.write(r.content)
    # if not search_videos_page.has_next:
    # Demo only downloads the first result (always saved as 'sample.mp4').
    break
    # search_videos_page = search_videos_page.get_next_page()
I just tried to do the same. When I was looking for it, I wanted a simple example. All other fancy stuff I was sure I could add myself. So, I built upon inou's answer. The shown example is very basic and requests one page with only 5 results using the 'Tiger' tag in the search query. I download the first video using its id provided by the response and simply write it to the source folder. The api is provided by pexelsPy and the request is executed using the standard requests package. To get access to the API, you need to create a key on pexels website (see here). Once you get your own API key, you should be able to simply substitute the shown example key and run the code as a test.
import pexelsPy
import requests

# Example key from the answer text -- substitute your own Pexels API key.
PEXELS_API = '16gv62567257256iu78krtuzwqsddudrtjberzabzwzjsrtgswnr'
api = pexelsPy.API(PEXELS_API)
# One page of 5 results for the 'Tiger' search tag.
api.search_videos('Tiger', page=1, results_per_page=5)
videos = api.get_videos()
# Build the direct-download URL from the first video's id.
url_video = 'https://www.pexels.com/video/' + str(videos[0].id) + '/download'
r = requests.get(url_video)
with open('test.mp4', 'wb') as outfile:
    outfile.write(r.content)
You can download multiple videos with this code :
import pexelsPy
import requests

PEXELS_API = '-'  # your Pexels API key
api = pexelsPy.API(PEXELS_API)
# Page 2 of landscape-orientation results for the 'nature' query, 100 per page.
api.search_videos('nature', page=2, results_per_page=100, orientation='landscape')
videos = api.get_videos()
for i, video in enumerate(videos):
    # Download each video via its /download endpoint into test_<i>.mp4.
    url_video = 'https://www.pexels.com/video/' + str(video.id) + '/download'
    r = requests.get(url_video)
    with open(f'test_{i}.mp4', 'wb') as outfile:
        outfile.write(r.content)
This will download 100 videos, with each video being written to a separate file named test_0.mp4, test_1.mp4, ..., test_99.mp4.

How do I isolate a .json file?

I was trying to split some parts of a .json, to completely isolate parts of a .json file from an API I found.
This is trying to isolate the open share price of any stocks on the internet. I've consulted with Stack Overflow, but I think I may have made a mistake in my paraphrasing.
# example
import sys
import requests
import json
from ticker import *
def main():
    """Prompt for a ticker and time window, fetch intraday data from Alpha
    Vantage, and print one hard-coded opening price.

    Returns 0 on success (used as the process exit code).
    """
    stock_ticker = input("Name the stock ticker?\n")
    time2 = int(input("How many minutes do you want to view history?\n"))
    #separate file to generate URL for API
    url = webpage(stock_ticker, time2)
    response = requests.get(url)
    # NOTE(review): `assert` is stripped under `python -O`; raising an
    # exception on a bad status would be more robust.
    assert response.status_code == 200
    data = json.loads(response.text)
    # Hard-coded timestamp and 5-minute interval -- exactly the values the
    # asker wants to generalize over.
    open_share_price = data["Time Series (5min)"]["2019-11-01 16:00:00"]["1. open"]
    print(open_share_price)
    return 0

if __name__ == "__main__":
    sys.exit(main())
Returns
136.800
I've been wanting to get open share prices from different time frames, not just 16:00:00, and not just at 5-minute intervals.
I'm not great at programming, so any help would be gratefully received. Sorry in advance for my conciseness errors
Edit: The link for the data. Sorry I didn't include it the first time around. https://www.alphavantage.co/query?function=TIME_SERIES_INTRADAY&symbol=kmb&interval=5min&apikey=exampleapikey
If you have to more than one element then you should use for-loop
import requests

url = 'https://www.alphavantage.co/query?function=TIME_SERIES_INTRADAY&symbol=kmb&interval=5min&apikey=exampleapikey'
response = requests.get(url)
data = response.json()
# Iterate over every timestamped entry instead of one hard-coded key.
for key, val in data["Time Series (5min)"].items():
    print(key, val["1. open"])
If you want to keep it as JSON, then create a new dictionary to hold the values and later save it to a file.
import requests
import json

url = 'https://www.alphavantage.co/query?function=TIME_SERIES_INTRADAY&symbol=kmb&interval=5min&apikey=exampleapikey'
response = requests.get(url)
data = response.json()
# Map each timestamp to its opening price only.
new_data = dict()
for key, val in data["Time Series (5min)"].items():
    new_data[key] = val["1. open"]
#print(new_data)
# Persist the reduced mapping as JSON.
with open('new_data.json', 'w') as fp:
    fp.write(json.dumps(new_data))

Improving the speed of a web scraper

I have created a web scraper to extract information of research papers that are published in a digital library (sample document).
Basically I'm extracting the title, abstract and list of references for each paper and storing them in text files. This process is repeated for all the referenced papers also.
I have used a queue to store the document IDs.
I need to extract such information from at least 5000 papers, but the program is too slow and takes about 3 hours to go through 250-300 papers.
What are the possible ways of improving the speed of this scraper?
Here is the code:
# _*_ coding:utf-8 _*_
# Python 2 script: urllib2 and Queue are the Python 2 module names.
import urllib2
import json
import Queue

# IDs already visited. NOTE(review): list membership tests are O(n);
# a set would make the `not in crawled` checks O(1).
crawled = []
# Global output file recording paper_id -> ref_paper_id edges; never closed.
fo = open("paper.txt", "w")
class Paper(object):
    """One IEEE Xplore document, fetched via its REST endpoints."""

    def __init__(self, paper_id):
        # Eager fetch: constructing a Paper performs one blocking HTTP request.
        self.paper_id = paper_id
        self.title, self.abstract = self.fetch_data()

    def fetch_data(self):
        """Fetch and return (title, abstract) for this paper's id."""
        base_url = "http://ieeexplore.ieee.org/rest/document/{0}/{1}"
        data_url = base_url.format(self.paper_id, "abstract")
        response = urllib2.urlopen(data_url)
        html = response.readlines()
        data = json.loads("\n".join(html))
        title = data["title"]
        abstract = data["abstract"]
        return title, abstract

    def fetch_ieee_references(self):
        """Fetch this paper's reference list as Paper objects.

        Each Paper() construction triggers another HTTP request, so this is
        expensive: 1 request for the list + 1 per reference.
        """
        base_url = "http://ieeexplore.ieee.org/rest/document/{0}/{1}"
        data_url = base_url.format(self.paper_id, "references")
        response = urllib2.urlopen(data_url)
        html = response.readlines()
        data = json.loads("\n".join(html))
        references = []
        try:
            for ref in data["references"]:
                try:
                    ref_link = ref["links"]["documentLink"]
                    ref_paper_id = ref_link.split("/")[-1]
                    references.append(Paper(ref_paper_id))
                # NOTE(review): bare except silently skips malformed or
                # link-less entries -- and hides real bugs.
                except:
                    pass
        except:
            pass
        return references

    def extract_paper(self):
        """Write this paper's title/abstract to <id>.txt and record its
        reference edges in the global `fo` file.

        NOTE(review): this calls fetch_ieee_references(), and new_func()
        calls it again per paper -- doubling the network traffic. Caching
        the result would roughly halve the run time.
        """
        try:
            print "Paper ID"
            print self.paper_id
            fname = str(self.paper_id)
            fname = fname + ".txt"
            # NOTE(review): fcon is never closed; a `with` block would be safer.
            fcon = open(fname,"w")
            print
            print "Title"
            print self.title
            print >>fcon, self.title
            print "Abstract"
            print self.abstract
            print >>fcon, self.abstract
            print "References"
            for ref in self.fetch_ieee_references():
                print ref.paper_id, ref.title
                print >>fo, self.paper_id, ref.paper_id
        # NOTE(review): bare except swallows every failure for the paper.
        except:
            pass
def new_func():
    """Breadth-first crawl of the citation graph from one seed paper id."""
    n_id = 6639344  # seed document id
    q = Queue.Queue()
    q.put_nowait(n_id)
    crawled.append(n_id)
    while not q.empty():
        p_id = q.get_nowait()
        paper = Paper(p_id)
        paper.extract_paper()
        # NOTE(review): extract_paper() already fetched the references; this
        # second fetch_ieee_references() call repeats all of those requests.
        for ref in paper.fetch_ieee_references():
            if ref.paper_id not in crawled:  # O(n) list scan per reference
                crawled.append(ref.paper_id)
                q.put_nowait(ref.paper_id)

new_func()
As already mentioned by other users it mostly depends on the speed of the HTTP request so you are dependent on the server of the site. So to speed things up you can divide the papers between multiple processes.
Also, I don't see why you read the HTML lines and then use json.loads — you can just call json.load on the response directly, which will speed things up a little bit.

how to add lists to a dictionary then output to .csv

I'm try to iterate through tables in html by a searchlabel, then update the found value to a dictionary, then write those values to a csv. The output currently works for both the url and the headline, but the name output will either be blank or show "None." If i print the output of blog["name'] however, it is correctly pulling the information I want. I suspect that it's an indentation error but I can't figure out where to line things up. I've tried moving things around but nothing seems to work to get the name assignment to work inside that loop.
import os
from bs4 import BeautifulSoup
import my_csv_writer
def td_finder(tr, searchLabel):
    """Return the text of the second <td> in *tr* when the row's text
    contains *searchLabel*; otherwise return an empty string."""
    if tr.text.find(searchLabel) == -1:
        return ""
    cells = tr.findAll('td')
    if len(cells) > 1:
        return cells[1].text
    return ""
def main():
    """Walk `topdir` for .html files, scrape the title and a 'name:' field
    from each, and append one row per file to output.csv."""
    topdir = 'some_directory'
    writer = my_csv_writer.CsvWriter("output.csv")
    writer.writeLine(["url", "headline", "name"])
    """Main Function"""
    blog = []
    for root, dirs, files in os.walk(topdir):
        for f in files:
            url = os.path.join(root, f)
            url = os.path.dirname(url).split('some_file')[1]
            if f.lower().endswith((".html")):
                # NOTE(review): file handle is never closed; prefer `with`.
                file_new = open(os.path.join(root, f), "r").read()
                soup = BeautifulSoup(file_new)
                blog = {}
                #Blog Title
                blog["title"] = soup.find('title').text
                for table in soup.findAll("table"):
                    for tr in table.findAll("tr"):
                        #name
                        # NOTE(review): overwritten on EVERY row, so only the
                        # last row's value survives -- this is the reported
                        # "blank or None" bug; capture it when non-empty.
                        blog["name"] = td_finder(tr, "name:")
                seq = [url, unicode(blog["title"]), unicode(blog.get("name"))]
                writer.writeLine(seq)
                #return ""
if __name__ == '__main__':
    main()
    # Python 2 print statement.
    print "Finished main"
You're writing unicode strings to a csv file which according to the official docs "The csv module doesn’t directly support reading and writing Unicode...".
It does offer alternative classes to enable different encodings via UnicodeWriter. The following answer from Boud on SO highlights the need to set the desired encoding in the CSV file.

Can't parse XML effectively using Python

import urllib
import xml.etree.ElementTree as ET
def getWeather(city):
    """Query the (long-defunct) Google Weather API for *city* and return the
    current condition string, or an error message.

    NOTE(review): as written this function always returns None or raises --
    see the inline notes; the corrected version appears later in the file.
    """
    #create google weather api url
    url = "http://www.google.com/ig/api?weather=" + urllib.quote(city)
    try:
        # open google weather api url
        f = urllib.urlopen(url)
    except:
        # if there was an error opening the url, return
        return "Error opening url"
    # read contents to a string
    s = f.read()
    # NOTE(review): ET.parse() expects a filename or file object, not a
    # string of XML -- this is the reported error; ET.fromstring(s) is needed.
    tree=ET.parse(s)
    current= tree.find("current_condition/condition")
    condition_data = current.get("data")
    weather = condition_data
    if weather == "<?xml version=":
        return "Invalid city"
    #return the weather condition
    # NOTE(review): the return is commented out, so callers get None.
    #return weather
def main():
    """Interactive loop: prompt for city names forever and print each lookup."""
    while True:
        city = raw_input("Give me a city: ")  # Python 2 input function
        weather = getWeather(city)
        print(weather)

if __name__ == "__main__":
    main()
This gives an error. I actually wanted to find values from the Google Weather XML site's tags.
Instead of
tree=ET.parse(s)
try
tree=ET.fromstring(s)
Also, your path to the data you want is incorrect. It should be: weather/current_conditions/condition
This should work:
import urllib
import xml.etree.ElementTree as ET
def getWeather(city):
    """Corrected lookup: parse the XML with fromstring() and use the full
    element path weather/current_conditions/condition.

    Returns the condition string, or an error message string.
    """
    #create google weather api url
    url = "http://www.google.com/ig/api?weather=" + urllib.quote(city)
    try:
        # open google weather api url
        f = urllib.urlopen(url)
    except:
        # if there was an error opening the url, return
        return "Error opening url"
    # read contents to a string
    s = f.read()
    # fromstring() parses XML held in a string (parse() wants a file).
    tree=ET.fromstring(s)
    current= tree.find("weather/current_conditions/condition")
    condition_data = current.get("data")
    weather = condition_data
    if weather == "<?xml version=":
        return "Invalid city"
    #return the weather condition
    return weather
def main():
    """Interactive loop: read city names and print their current weather."""
    while True:
        city = raw_input("Give me a city: ")  # Python 2 input function
        weather = getWeather(city)
        print(weather)
I'll give the same answer here I did in my comment on your previous question. In the future, kindly update the existing question instead of posting a new one.
Original
I'm sorry - I didn't mean that my code would work exactly as you desired. Your error is because s is a string and parse takes a file or file-like object. So, "tree = ET.parse(f)" may work better. I would suggest reading up on the ElementTree api so you understand what the functions I've used above do in practice. Hope that helps, and let me know if it works.

Categories