Improving the speed of a web scraper - python

I have created a web scraper to extract information about research papers published in a digital library (sample document).
Basically, I'm extracting the title, abstract and list of references for each paper and storing them in text files. The same process is then repeated for every referenced paper.
I have used a queue to store the document IDs.
I need to extract this information from at least 5000 papers, but the program is too slow and takes about 3 hours to get through 250-300 papers.
What are the possible ways of improving the speed of this scraper?
Here is the code:
# _*_ coding:utf-8 _*_
import urllib2
import json
import Queue

crawled = []
fo = open("paper.txt", "w")


class Paper(object):
    def __init__(self, paper_id):
        self.paper_id = paper_id
        self.title, self.abstract = self.fetch_data()

    def fetch_data(self):
        base_url = "http://ieeexplore.ieee.org/rest/document/{0}/{1}"
        data_url = base_url.format(self.paper_id, "abstract")
        response = urllib2.urlopen(data_url)
        html = response.readlines()
        data = json.loads("\n".join(html))
        title = data["title"]
        abstract = data["abstract"]
        return title, abstract

    def fetch_ieee_references(self):
        base_url = "http://ieeexplore.ieee.org/rest/document/{0}/{1}"
        data_url = base_url.format(self.paper_id, "references")
        response = urllib2.urlopen(data_url)
        html = response.readlines()
        data = json.loads("\n".join(html))
        references = []
        try:
            for ref in data["references"]:
                try:
                    ref_link = ref["links"]["documentLink"]
                    ref_paper_id = ref_link.split("/")[-1]
                    references.append(Paper(ref_paper_id))
                except:
                    pass
        except:
            pass
        return references

    def extract_paper(self):
        try:
            print "Paper ID"
            print self.paper_id
            fname = str(self.paper_id)
            fname = fname + ".txt"
            fcon = open(fname, "w")
            print
            print "Title"
            print self.title
            print >>fcon, self.title
            print "Abstract"
            print self.abstract
            print >>fcon, self.abstract
            print "References"
            for ref in self.fetch_ieee_references():
                print ref.paper_id, ref.title
                print >>fo, self.paper_id, ref.paper_id
        except:
            pass


def new_func():
    n_id = 6639344
    q = Queue.Queue()
    q.put_nowait(n_id)
    crawled.append(n_id)
    while not q.empty():
        p_id = q.get_nowait()
        paper = Paper(p_id)
        paper.extract_paper()
        for ref in paper.fetch_ieee_references():
            if ref.paper_id not in crawled:
                crawled.append(ref.paper_id)
                q.put_nowait(ref.paper_id)

new_func()

As other users have already mentioned, the run time is dominated by the HTTP requests, so you are largely at the mercy of the site's server. Still, to speed things up you can divide the papers between multiple processes.
Also, I don't see why you read the response line by line and then call json.loads; you can just call json.load on the response object, which is simpler and a little faster.
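For illustration, here is a minimal, untested sketch of both suggestions combined, keeping the question's Python 2 style; the helper names and the pool size of 8 are my own assumptions, not part of the original code:

# A rough sketch of the two suggestions above: json.load() reads straight from
# the response object, and a thread pool issues several requests at once
# (threads are enough here because the work is I/O-bound).
import json
import urllib2
from multiprocessing.dummy import Pool  # thread-based Pool from the stdlib

BASE_URL = "http://ieeexplore.ieee.org/rest/document/{0}/{1}"

def fetch_abstract(paper_id):
    response = urllib2.urlopen(BASE_URL.format(paper_id, "abstract"))
    data = json.load(response)          # parse directly, no readlines()/join()
    return data["title"], data["abstract"]

def fetch_many(paper_ids):
    workers = Pool(8)                   # 8 concurrent downloads (an assumption; tune it)
    try:
        return workers.map(fetch_abstract, paper_ids)
    finally:
        workers.close()
        workers.join()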

Related

How do I download videos from Pexels API?

I have this code that can pull images off Pexels, but I don't know how to change it to download videos instead. I haven't seen anyone do this before, and any help is greatly appreciated. I tried switching all the photo tags to videos, but that did not work. I also tried adding more libraries, but that did not help either.
import argparse
import json
import os
import time

import requests
import tqdm
from pexels_api import API

PEXELS_API_KEY = os.environ['PEXELS_KEY']
MAX_IMAGES_PER_QUERY = 100
RESULTS_PER_PAGE = 10
PAGE_LIMIT = MAX_IMAGES_PER_QUERY / RESULTS_PER_PAGE


def get_sleep(t):
    def sleep():
        time.sleep(t)
    return sleep


def main(args):
    sleep = get_sleep(args.sleep)
    api = API(PEXELS_API_KEY)
    query = args.query
    page = 1
    counter = 0
    photos_dict = {}

    # Step 1: Getting urls and meta information
    while page <= PAGE_LIMIT:
        api.search(query, page=page, results_per_page=RESULTS_PER_PAGE)
        photos = api.get_entries()
        for photo in tqdm.tqdm(photos):
            photos_dict[photo.id] = vars(photo)['_Photo__photo']
            counter += 1
        if not api.has_next_page:
            break
        page += 1
        sleep()
    print(f"Finishing at page: {page}")
    print(f"Images were processed: {counter}")

    # Step 2: Downloading
    if photos_dict:
        os.makedirs(args.path, exist_ok=True)
        # Saving dict
        with open(os.path.join(args.path, f'{query}.json'), 'w') as fout:
            json.dump(photos_dict, fout)
        for val in tqdm.tqdm(photos_dict.values()):
            url = val['src'][args.resolution]
            fname = os.path.basename(val['src']['original'])
            image_path = os.path.join(args.path, fname)
            if not os.path.isfile(image_path):  # ignore if already downloaded
                response = requests.get(url, stream=True)
                with open(image_path, 'wb') as outfile:
                    outfile.write(response.content)
            else:
                print(f"File exists: {image_path}")


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--query', type=str, required=True)
    parser.add_argument('--path', type=str, default='./results_pexels')
    parser.add_argument('--resolution', choices=['original', 'large2x', 'large',
                                                 'medium', 'small', 'portrait',
                                                 'landscape', 'tiny'], default='original')
    parser.add_argument('--sleep', type=float, default=0.1)
    args = parser.parse_args()
    main(args)
Sorry for bumping the question. I just faced a similar situation when downloading videos from Pexels using the Python API pexelsPy, and this may be helpful:
I retrieved the ID of each video and then built the download URL, which has the following structure: "https://www.pexels.com/video/" + ID + "/download".
See the following example:
def download_video(type_of_videos):
    video_tag = random.choice(type_of_videos)
    PEXELS_API = '-'  # please add your API Key here
    api = API(PEXELS_API)
    retrieved_videos = read_already_download_files('downloaded_files.txt')
    video_found_flag = True
    num_page = 1
    while video_found_flag:
        api.search_videos(video_tag, page=num_page, results_per_page=10)
        videos = api.get_videos()
        for data in videos:
            if data.width > data.height:  # look for horizontal orientation videos
                if data.url not in retrieved_videos:
                    # write_file('downloaded_files.txt', data.url)
                    url_video = 'https://www.pexels.com/video/' + str(data.id) + '/download'  # create the url with the video id
                    r = requests.get(url_video)
                    with open(data.url.split('/')[-2] + '.mp4', 'wb') as outfile:
                        outfile.write(r.content)
                    return data.url.split('/')[-2] + '.mp4'  # download the video
        num_page += 1
The download_video function takes a list of tag strings, e.g. ['happy', 'sad', 'relax'], and randomly chooses one of them.
PEXELS_API should contain your API key.
read_already_download_files('downloaded_files.txt') retrieves the already downloaded files, so the function can check whether the video it just found has been downloaded before.
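The two bookkeeping helpers, read_already_download_files and write_file, are not shown in the answer; a plausible sketch of them (purely my guess, assuming a plain text file with one URL per line) would be:

def read_already_download_files(filename):
    # Return the set of video URLs recorded so far, or an empty set if the
    # bookkeeping file does not exist yet.
    try:
        with open(filename) as f:
            return set(line.strip() for line in f)
    except IOError:
        return set()

def write_file(filename, url):
    # Append a freshly downloaded video URL so later runs can skip it.
    with open(filename, 'a') as f:
        f.write(url + '\n')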
from pypexels import PyPexels
import requests

api_key = 'api id'
# instantiate PyPexels object
py_pexel = PyPexels(api_key=api_key)
search_videos_page = py_pexel.videos_search(query="love", per_page=40)
# while True:
for video in search_videos_page.entries:
    print(video.id, video.user.get('name'), video.url)
    data_url = 'https://www.pexels.com/video/' + str(video.id) + '/download'
    r = requests.get(data_url)
    print(r.headers.get('content-type'))
    with open('sample.mp4', 'wb') as outfile:
        outfile.write(r.content)
    # if not search_videos_page.has_next:
    break
    # search_videos_page = search_videos_page.get_next_page()
I just tried to do the same. When I was looking for it, I wanted a simple example; I was sure I could add the fancier features myself, so I built upon inou's answer. The example shown is very basic: it requests one page with only 5 results using the 'Tiger' tag in the search query, downloads the first video using the id provided by the response, and simply writes it to the source folder. The API is provided by pexelsPy and the request is executed with the standard requests package. To get access to the API, you need to create a key on the Pexels website (see here). Once you have your own API key, you should be able to simply substitute the example key shown and run the code as a test.
import pexelsPy
import requests

PEXELS_API = '16gv62567257256iu78krtuzwqsddudrtjberzabzwzjsrtgswnr'
api = pexelsPy.API(PEXELS_API)
api.search_videos('Tiger', page=1, results_per_page=5)
videos = api.get_videos()
url_video = 'https://www.pexels.com/video/' + str(videos[0].id) + '/download'
r = requests.get(url_video)
with open('test.mp4', 'wb') as outfile:
    outfile.write(r.content)
You can download multiple videos with this code:
import pexelsPy
import requests

PEXELS_API = '-'
api = pexelsPy.API(PEXELS_API)
api.search_videos('nature', page=2, results_per_page=100, orientation='landscape')
videos = api.get_videos()
for i, video in enumerate(videos):
    url_video = 'https://www.pexels.com/video/' + str(video.id) + '/download'
    r = requests.get(url_video)
    with open(f'test_{i}.mp4', 'wb') as outfile:
        outfile.write(r.content)
This will download 100 videos, with each video being written to a separate file named test_0.mp4, test_1.mp4, ..., test_99.mp4.

Get movie information from IMDb API website

I have already used Scrapy spiders to crawl IMDb IDs from the IMDb website.
Now I want to use an IMDb API site together with the IDs I collected to build a dictionary and save it into a JSON file.
import requests
import json

def query_url(id):
    #query_url = 'http://www.omdbapi.com/?i='+id+'&plot=short&r=json'
    return query_url

def get_movie_ids(input_file):
    #id_list = []
    #with open(input_file, 'r') as f:
    #    for line in f:
    #        id_list.append(line.strip())  # sth like ['tt0407887', 'tt1212123', ... ]
    return id_list

def get_all_data(in_file, out_file):
    movie_data_dict = {}
    movie_ids = get_movie_ids(in_file)
    id_counter = 0
    session = requests.Session()
    for id in movie_ids:
        url = query_url(id)
        #try:
        #    movie_data = session.get(url).json()  # to catch corrupted json file
        #except ValueError:
        #    pass
        movie_data_dict[id_counter] = movie_data
        id_counter += 1
    with open(out_file, 'w+') as f:
        json.dump(movie_data_dict, f)

if __name__ == '__main__':
    movie_id_file = r'../IMDbIDCrawler/movie_id10-15'  # the IMDb ID crawled by Scrapy
    movie_data_file = 'IMDb2010-2015.json'
    get_all_data(movie_id_file, movie_data_file)
When I run the code from the command prompt, it simply won't run, and I don't know what I have done wrong in the script above.
This is Python homework: the lines marked with # are the ones I wrote, while the rest of the code was provided. The pace of the course is very fast and I can't keep up, and I have no one to ask, so please forgive me if this is a really basic beginner question.
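A minimal sketch of how the marked sections might be completed, assuming the OMDb endpoint shown in the commented-out line and an input file with one IMDb ID per line (this is only one plausible reading of the assignment, not the official solution):

import requests
import json

def query_url(id):
    # Build the OMDb request URL for one IMDb ID (endpoint taken from the
    # commented-out line in the question).
    return 'http://www.omdbapi.com/?i=' + id + '&plot=short&r=json'

def get_movie_ids(input_file):
    # One IMDb ID per line, e.g. 'tt0407887'.
    id_list = []
    with open(input_file, 'r') as f:
        for line in f:
            id_list.append(line.strip())
    return id_list

def get_all_data(in_file, out_file):
    movie_data_dict = {}
    session = requests.Session()
    for id_counter, movie_id in enumerate(get_movie_ids(in_file)):
        try:
            movie_data = session.get(query_url(movie_id)).json()
        except ValueError:          # skip responses that are not valid JSON
            continue
        movie_data_dict[id_counter] = movie_data
    with open(out_file, 'w+') as f:
        json.dump(movie_data_dict, f)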

gevent pool getting stuck

I am a gevent newbie, but I think I got it working, in a limited sense. Basically, for pools of 1 the code proceeds, while for larger pools the code gets stuck, usually within the first pool (e.g. with a pool of 5, I see 3 greenlets finishing, but not more). What is going wrong? Spawn? Join?
I cannot verify whether the remote server gets confused by multiple queries, but it has no problem with a rapid sequence of serial requests, so probably not…
(I share the code in its entirety as I am not sure where the bug is. Thanks for bearing with me.)
from urllib2 import urlopen
from lxml.etree import parse
import os, csv, cStringIO, codecs, pickle, sys  # sys is needed for sys.exc_info() below
from selenium import webdriver
from time import sleep

import gevent
from gevent import socket
from gevent import monkey, pool
# patches stdlib (including socket and ssl modules) to cooperate with other greenlets
monkey.patch_all()


class UnicodeWriter:
    """
    A CSV writer which will write rows to CSV file "f",
    which is encoded in the given encoding.
    """
    def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
        # Redirect output to a queue
        self.queue = cStringIO.StringIO()
        self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
        self.stream = f
        self.encoder = codecs.getincrementalencoder(encoding)()

    def writerow(self, row):
        self.writer.writerow([unicode(s).encode("utf-8") for s in row])
        # Fetch UTF-8 output from the queue ...
        data = self.queue.getvalue()
        data = data.decode("utf-8")
        # ... and reencode it into the target encoding
        data = self.encoder.encode(data)
        # write to the target stream
        self.stream.write(data)
        # empty queue
        self.queue.truncate(0)

    def writerows(self, rows):
        for row in rows:
            self.writerow(row)


os.chdir('/Users/laszlosandor/Downloads/kozbeszerzes')

HOSTNAME = 'http://kozbeszerzes.ceu.hu'

driver = webdriver.Chrome()
results = set()

for y in xrange(1998, 2015):
    for p in xrange(0, 9999):
        driver.get('http://kozbeszerzes.ceu.hu/searchresults.xhtml?q={}&page={}'.format(y, p))
        sleep(1)
        if len(driver.find_elements_by_class_name('result')) == 0:
            break
        for e in driver.find_elements_by_class_name('result'):
            link = e.find_element_by_tag_name('a')
            r = link.get_attribute('href').encode('ascii', 'ignore')
            if r[:34] == 'http://kozbeszerzes.ceu.hu/tender/':
                results.add(r)
driver.quit()

with open('list_of_urls', 'wb') as f:
    pickle.dump(results, f)
#with open('list_of_urls', 'r') as f:
#    results = pickle.load(f)

entities = set()
header = ('TenderID', 'RequestorName', 'URL', 'Year', 'RequestorID', 'Subject',
          'SourceURL', 'EstValue', 'Currency', 'DecisionDate', 'Value', 'VAT')

# """Spawn multiple workers and wait for them to complete"""
# # limit ourselves to max 10 simultaneous outstanding requests
p = pool.Pool(10)

f = open('tenders.csv', 'w')
f.write(codecs.BOM_UTF8)
writer = UnicodeWriter(f)
writer.writerow(header)

def workres(res):
    try:
        tender = parse(urlopen(res)).getroot()
        print ('%s succeeded' % res)
        for requestor in tender.findall('requestor'):
            entities.add(HOSTNAME + requestor.get('url'))
        id = tender.get('id')
        reqname = tender.get('requestor')
        url = tender.get('url')
        year = tender.get('year')
        reqid = tender.get('requestor_id')
        subject = tender.get('subject')
        source = tender.get('source_url')
        estval = tender.get('estimated_value')
        for part in tender.findall('./parts/part'):
            winner = part.find('winner')
            entities.add(HOSTNAME + winner.get('url'))
            curr = part.find('currency').text
            date = part.find('decisionDate').text
            value = part.find('value').text
            vat = part.find('vat').text
            row = id, reqname, url, year, reqid, subject, source, estval, curr, date, value, vat
            writer.writerow(row)
    except socket.gaierror:
        ex = sys.exc_info()[1]
        print ('%s failed with %s' % (res, ex))

jobs = [p.spawn(workres, res) for res in results]
p.join()

f.close()
with open('entities', 'wb') as f:
    pickle.dump(entities, f)

header = ['ID', 'URL', 'Name', 'NominalCity', 'City', 'ZIP', 'Address']
f = open('entities.csv', 'w')
f.write(codecs.BOM_UTF8)
writer = UnicodeWriter(f)
writer.writerow(header)

def workent(ent):
    try:
        ent = parse(urlopen(ent)).getroot()
        print ('%s succeeded' % ent)
        id = ent.get('id')
        url = ent.get('url')
        name = ent.get('name')
        nominalcity = ent.get('city')
        cities = ent.findall('./resolved_addresses/whitelistAddress/city')
        zips = ent.findall('./resolved_addresses/whitelistAddress/postalCode')
        streets = ent.findall('./resolved_addresses/whitelistAddress/street')
        for a in xrange(0, len(cities)):
            city = cities[a].text
            zip = zips[a].text
            street = streets[a].text
            row = id, url, name, nominalcity, city, zip, street
            writer.writerow(row)
    except socket.gaierror:
        ex = sys.exc_info()[1]
        print ('%s failed with %s' % (ent, ex))

jobs = [p.spawn(workent, ent) for ent in entities]
p.join()

f.close()
I see several mistakes here.
You should use gevent.sleep(), not time.sleep(), which blocks.
Your variable names are too short. You could add descriptions of what each part of the code is supposed to do; for example, the variable 'p' is used for two different things.
There are URL fetches via both urlopen and the Selenium driver, which is confusing.
I would use queues between the different workers and have just one worker do the writerow calls and deal with the file access; right now you have multiple greenlets accessing the same file (see the sketch below).
Use fewer list comprehensions; just write out the loops.
I would suggest putting the try/except in workres only around the parse(urlopen()) call; there may be more exceptions happening that you currently don't see.
More tips for gevent.
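A minimal sketch of the single-writer idea, with hypothetical names: worker greenlets put finished rows on a gevent queue and one dedicated greenlet owns the CSV writer, so the file is only ever touched from one place.

import gevent
from gevent import monkey, pool
from gevent.queue import Queue

monkey.patch_all()

rows = Queue()
STOP = object()                       # sentinel telling the writer to finish

def writer(csv_writer):
    # The only greenlet that touches the file.
    for row in rows:
        if row is STOP:
            return
        csv_writer.writerow(row)

def worker(url):
    # ... fetch and parse `url` here, then hand the result to the writer ...
    rows.put(('some', 'fields', url))

def run(urls, csv_writer):
    writer_job = gevent.spawn(writer, csv_writer)
    workers = pool.Pool(10)           # at most 10 requests in flight
    workers.map(worker, urls)         # blocks until every worker is done
    rows.put(STOP)                    # then let the writer drain and exit
    writer_job.join()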

I don't see why this code is not working! Can someone please tell me what I am doing wrong?

I keep getting an error, but I don't see why.
I am new to programming, so if you explain the code to me, please don't assume I know very much.
#!/usr/bin/env python
# Name:
# Student number:
'''
This script crawls the IMDB top 250 movies.
'''
# Python standard library imports
import os
import sys
import csv
import codecs
import cStringIO
import errno

# Third party library imports:
import pattern
from pattern.web import URL, DOM

# --------------------------------------------------------------------------
# Constants:
TOP_250_URL = 'http://www.imdb.com/chart/top'
OUTPUT_CSV = 'top250movies.csv'
SCRIPT_DIR = os.path.split(os.path.realpath(__file__))[0]
BACKUP_DIR = os.path.join(SCRIPT_DIR, 'HTML_BACKUPS')

# --------------------------------------------------------------------------
# Unicode reading/writing functionality for the Python CSV module, taken
# from the Python.org csv module documentation (very slightly adapted).
# Source: http://docs.python.org/2/library/csv.html (retrieved 2014-03-09).
class UTF8Recoder(object):
    """
    Iterator that reads an encoded stream and reencodes the input to UTF-8
    """
    def __init__(self, f, encoding):
        self.reader = codecs.getreader(encoding)(f)

    def __iter__(self):
        return self

    def next(self):
        return self.reader.next().encode("utf-8")


class UnicodeReader(object):
    """
    A CSV reader which will iterate over lines in the CSV file "f",
    which is encoded in the given encoding.
    """
    def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
        f = UTF8Recoder(f, encoding)
        self.reader = csv.reader(f, dialect=dialect, **kwds)

    def next(self):
        row = self.reader.next()
        return [unicode(s, "utf-8") for s in row]

    def __iter__(self):
        return self


class UnicodeWriter(object):
    """
    A CSV writer which will write rows to CSV file "f",
    which is encoded in the given encoding.
    """
    def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
        # Redirect output to a queue
        self.queue = cStringIO.StringIO()
        self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
        self.stream = f
        self.encoder = codecs.getincrementalencoder(encoding)()

    def writerow(self, row):
        self.writer.writerow([s.encode("utf-8") for s in row])
        # Fetch UTF-8 output from the queue ...
        data = self.queue.getvalue()
        data = data.decode("utf-8")
        # ... and reencode it into the target encoding
        data = self.encoder.encode(data)
        # write to the target stream
        self.stream.write(data)
        # empty queue
        self.queue.truncate(0)

    def writerows(self, rows):
        for row in rows:
            self.writerow(row)


# --------------------------------------------------------------------------
# Utility functions (no need to edit):
def create_dir(directory):
    '''
    Create directory if needed.
    Args:
        directory: string, path of directory to be made
    Note: the backup directory is used to save the HTML of the pages you
    crawl.
    '''
    try:
        os.makedirs(directory)
    except OSError as e:
        if e.errno == errno.EEXIST:
            # Backup directory already exists, no problem for this script,
            # just ignore the exception and carry on.
            pass
        else:
            # All errors other than an already exising backup directory
            # are not handled, so the exception is re-raised and the
            # script will crash here.
            raise


def save_csv(filename, rows):
    '''
    Save CSV file with the top 250 most popular movies on IMDB.
    Args:
        filename: string filename for the CSV file
        rows: list of rows to be saved (250 movies in this exercise)
    '''
    with open(filename, 'wb') as f:
        writer = UnicodeWriter(f)  # implicitly UTF-8
        writer.writerow([
            'title', 'runtime', 'genre(s)', 'director(s)', 'writer(s)',
            'actor(s)', 'rating(s)', 'number of rating(s)'
        ])
        writer.writerows(rows)


def make_backup(filename, html):
    '''
    Save HTML to file.
    Args:
        filename: absolute path of file to save
        html: (unicode) string of the html file
    '''
    with open(filename, 'wb') as f:
        f.write(html)


def main():
    '''
    Crawl the IMDB top 250 movies, save CSV with their information.
    Note:
        This function also makes backups of the HTML files in a sub-directory
        called HTML_BACKUPS (those will be used in grading).
    '''
    # Create a directory to store copies of all the relevant HTML files (those
    # will be used in testing).
    print 'Setting up backup dir if needed ...'
    create_dir(BACKUP_DIR)

    # Make backup of the IMDB top 250 movies page
    print 'Access top 250 page, making backup ...'
    top_250_url = URL(TOP_250_URL)
    top_250_html = top_250_url.download(cached=True)
    make_backup(os.path.join(BACKUP_DIR, 'index.html'), top_250_html)

    # extract the top 250 movies
    print 'Scraping top 250 page ...'
    url_strings = scrape_top_250(top_250_url)

    # grab all relevant information from the 250 movie web pages
    rows = []
    for i, url in enumerate(url_strings):  # Enumerate, a great Python trick!
        print 'Scraping movie %d ...' % i
        # Grab web page
        movie_html = URL(url).download(cached=True)
        # Extract relevant information for each movie
        movie_dom = DOM(movie_html)
        rows.append(scrape_movie_page(movie_dom))
        # Save one of the IMDB's movie pages (for testing)
        if i == 83:
            html_file = os.path.join(BACKUP_DIR, 'movie-%03d.html' % i)
            make_backup(html_file, movie_html)

    # Save a CSV file with the relevant information for the top 250 movies.
    print 'Saving CSV ...'
    save_csv(os.path.join(SCRIPT_DIR, 'top250movies.csv'), rows)
The function below should return the web page links of the top 250 movies:
# --------------------------------------------------------------------------
# Functions to adapt or provide implementations for:
def scrape_top_250(url):
    '''
    Scrape the IMDB top 250 movies index page.
    Args:
        url: pattern.web.URL instance pointing to the top 250 index page
    Returns:
        A list of strings, where each string is the URL to a movie's page on
        IMDB, note that these URLS must be absolute (i.e. include the http
        part, the domain part and the path part).
    '''
    movie_urls = []
    table_rows = dom.by_id('main').by_tag('table')[1].by_tag('tr')
    for tr in table_rows[1:]:
        a = tr.by_tag('a')[0]
        movie_urls.append(clean_unicode(abs_url(a.attributes.get('href', ''), url.string)))

    # YOUR SCRAPING CODE GOES HERE, ALL YOU ARE LOOKING FOR ARE THE ABSOLUTE
    # URLS TO EACH MOVIE'S IMDB PAGE, ADD THOSE TO THE LIST movie_urls.

    # return the list of URLs of each movie's page on IMDB
    return movie_urls

#print scrape_top_250(url)
And finally, this function should return the specific contents of each movie page.
def scrape_movie_page(dom):
    '''
    Scrape the IMDB page for a single movie
    Args:
        dom: pattern.web.DOM instance representing the page of 1 single
            movie.
    Returns:
        A list of strings representing the following (in order): title, year,
        duration, genre(s) (semicolon separated if several), director(s)
        (semicolon separated if several), writer(s) (semicolon separated if
        several), actor(s) (semicolon separated if several), rating, number
        of ratings.
    '''
    # YOUR SCRAPING CODE GOES HERE:
    for p in movie_urls:
        p_url = URL(p)
        p_dom = DOM(p_url.download(cached=True))

        title = clean_unicode(p_dom.by_class('header')[0].content)
        title = plaintext(strip_between('<span', '</span>', title))

        runtime = clean_unicode(p_dom.by_class('infobar')[0].by_tag('time')[0].content)
        duration = runtime

        genres = []
        for genre in p_dom.by_class('infobar')[0].by_tag('a')[:-1]:
            genres.append(clean_unicode(genre.content))

        directors = []
        writers = []
        actors = []
        text_blocks = p_dom.by_class('txt-block')[:3]
        for t in text_blocks:
            spans = t.by_tag('span')
            for s in spans:
                if s.attributes.get('itemprop') == 'director':
                    director = s.by_tag('span')[0].by_tag('a')[0].content
                    directors.append(clean_unicode(director))
                if s.attributes.get('itemprop') == 'writer':
                    p_writer = s.by_tag('span')[0].by_tag('a')[0].content
                    writers.append(clean_unicode(p_writer))
                if s.attributes.get('itemprop') == 'actors':
                    actor = s.by_tag('span')[0].by_tag('a')[0].content
                    actors.append(clean_unicode(actor))

        rating = []
        ratings_count = []
        spans = p_dom.by_class('star-box-details')[0].by_tag('span')
        for s in spans:
            if s.attributes.get('itemprop') == 'ratingValue':
                rating = clean_unicode(s.content)
            if s.attributes.get('itemprop') == 'ratingCount':
                ratings_count = clean_unicode(s.content)

        # format the strings from lists
        genres = concat_strings(genres)
        directors = concat_strings(directors)
        writers = concat_strings(writers)
        actors = concat_strings(actors)

    # Return everything of interest for this movie (all strings as specified
    # in the docstring of this function).
    return title, duration, genres, directors, writers, actors, rating, \
        n_ratings


if __name__ == '__main__':
    main()  # call into the progam

    # If you want to test the functions you wrote, you can do that here:
    # ...
It's just that (in the original revision) you forgot to indent the body of the function scrape_movie_page. The for loop is in module scope.
The most common cause of this error is a function body that is not indented properly, but sometimes the code looks correctly indented and still throws the same error. In my experience that comes from mismatched indentation: if, within the same block, some lines are indented with tabs and others with spaces, the code can look fine but Python will still raise an indentation error.
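A toy illustration of both failure modes, using a made-up function rather than the question's code:

# Correct: every statement of the body shares the same indentation.
def count_links(links):
    total = 0
    for link in links:
        total += 1
    return total

# Broken: the `for` line below is flush with the `def`, so Python ends the
# function after the docstring and the loop runs at module scope, where the
# function's argument does not exist. Mixing a tab on one line with spaces on
# the next inside the same block fails similarly (a TabError in Python 3).
#
# def count_links(links):
#     '''Count links.'''
# for link in links:
#     total += 1
#     return total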

Can't parse XML effectively using Python

import urllib
import xml.etree.ElementTree as ET

def getWeather(city):
    #create google weather api url
    url = "http://www.google.com/ig/api?weather=" + urllib.quote(city)
    try:
        # open google weather api url
        f = urllib.urlopen(url)
    except:
        # if there was an error opening the url, return
        return "Error opening url"
    # read contents to a string
    s = f.read()
    tree = ET.parse(s)
    current = tree.find("current_condition/condition")
    condition_data = current.get("data")
    weather = condition_data
    if weather == "<?xml version=":
        return "Invalid city"
    #return the weather condition
    #return weather

def main():
    while True:
        city = raw_input("Give me a city: ")
        weather = getWeather(city)
        print(weather)

if __name__ == "__main__":
    main()
This gives an error; I actually wanted to extract values from the tags of the Google weather XML page.
Instead of
tree=ET.parse(s)
try
tree=ET.fromstring(s)
Also, your path to the data you want is incorrect. It should be: weather/current_conditions/condition
This should work:
import urllib
import xml.etree.ElementTree as ET

def getWeather(city):
    #create google weather api url
    url = "http://www.google.com/ig/api?weather=" + urllib.quote(city)
    try:
        # open google weather api url
        f = urllib.urlopen(url)
    except:
        # if there was an error opening the url, return
        return "Error opening url"
    # read contents to a string
    s = f.read()
    tree = ET.fromstring(s)
    current = tree.find("weather/current_conditions/condition")
    condition_data = current.get("data")
    weather = condition_data
    if weather == "<?xml version=":
        return "Invalid city"
    #return the weather condition
    return weather

def main():
    while True:
        city = raw_input("Give me a city: ")
        weather = getWeather(city)
        print(weather)
I'll give the same answer here I did in my comment on your previous question. In the future, kindly update the existing question instead of posting a new one.
Original
I'm sorry - I didn't mean that my code would work exactly as you desired. Your error is because s is a string and parse takes a file or file-like object. So, "tree = ET.parse(f)" may work better. I would suggest reading up on the ElementTree api so you understand what the functions I've used above do in practice. Hope that helps, and let me know if it works.
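To make the difference concrete, here is a small self-contained example with a made-up XML snippet (not the Google API response): fromstring() takes the XML document as a string, while parse() takes a filename or a file-like object.

import xml.etree.ElementTree as ET

xml_text = '<weather><current_conditions><condition data="Sunny"/></current_conditions></weather>'

# fromstring() parses a string and returns the root element directly ...
root = ET.fromstring(xml_text)

# ... while parse() expects a filename or an open file-like object and
# returns an ElementTree, so you would call .getroot() on it:
# root = ET.parse('weather.xml').getroot()

condition = root.find('current_conditions/condition')
print(condition.get('data'))   # -> Sunny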
