How to scrape multiple pages using the Apple iTunes API - Python

I'm trying to scrape the iTunes Search API to get information for all of the podcasts available in the Apple iTunes store. Currently, I'm only able to pull 200 at a time. When I try to grab the next 200 podcasts in the list, I get the same 200 as before.
https://itunes.apple.com/search?term=podcast&limit=2
https://itunes.apple.com/search?term=podcast&limit=2&offset=1
Any suggestions would be appreciated.
import requests
import pandas as pd
import time
import json

url = 'https://itunes.apple.com/search?term=podcast&limit=2'
res = requests.get(url, headers={'User-agent': 'project'})
res.status_code

current_url = None
posts = []
the_offset = 0

for _ in range(2):
    if current_url is None:
        current_url = url
    else:
        current_url = url + '&offset={}'.format(the_offset)
    res = requests.get(current_url)
    if res.status_code != 200:
        print('Error', res.status_code)
        break
    the_offset += 1
    current_dict = res.json()
    current_posts = {k: v for (k, v) in current_dict.items()}
    posts.extend(current_posts['results'])
    print(current_url)
    time.sleep(3)

Try changing the offset parameter:
results = 100
limit = 10
pages = results // limit

for i in range(pages):
    offset = i + 1
    request(offset)   # placeholder for the actual API call
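The question's URLs already pass limit and offset, so the key change is to step offset by the page size rather than by 1 and stop when a page comes back empty. A minimal sketch along those lines (keeping the 3-second pause and User-agent header from the original code; the empty-results stop condition is an assumption about the API's behaviour):
import time
import requests

base_url = 'https://itunes.apple.com/search'
limit = 200
offset = 0
posts = []

while True:
    params = {'term': 'podcast', 'limit': limit, 'offset': offset}
    res = requests.get(base_url, params=params, headers={'User-agent': 'project'})
    if res.status_code != 200:
        print('Error', res.status_code)
        break
    results = res.json().get('results', [])
    if not results:          # no more podcasts to fetch
        break
    posts.extend(results)
    offset += limit          # step by the page size, not by 1
    time.sleep(3)            # be polite to the API

print(len(posts), 'podcasts collected')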

Related

How can I count all the job listings on the other pages in Python?

I'm trying to make an API call and count all the job postings from all pages. The thing is, I only managed to get the count for the first page, and I'm having trouble making another API call for the next page. Just to mention that there is a maximum of 50 jobs per page. Thanks in advance!
import requests

baseurl = "https://jobs.github.com/positions.json"

def get_number_of_jobs(technology):
    number_of_jobs = 0
    page_count = 0
    tech = {'description': technology}
    response = requests.get(baseurl, params=tech)
    if response.ok:
        jobs = response.json()
        for job in jobs:
            for elm in job:
                if elm == 'id':
                    number_of_jobs += 1
        if number_of_jobs > 49:
            page_count += 1
            tech = {'description': technology, 'page': page_count}
            response = requests.get(baseurl, params=tech)
            jobs = response.json()
            number_of_jobs += 1
    return technology, number_of_jobs
I think you are looking for something like this; you can use len() instead of looping through and counting the "id" attribute.
import requests

baseurl = "https://jobs.github.com/positions.json"

def get_number_of_jobs(technology):
    job_count = 50
    total_job_count = 0
    page_count = 1
    while job_count > 49:
        job_count = 0
        # send request to GitHub for the next page
        tech = {'description': technology, 'page': page_count}
        response = requests.get(baseurl, params=tech)
        if response.ok:
            jobs = response.json()
            page_count += 1
            job_count = len(jobs)
            total_job_count += job_count
    return technology, total_job_count

print(get_number_of_jobs("java"))
The first call you make is page 1, so your algorithm would request page 1 twice. You can start at page 1 and keep looping as long as a page returns more than 49 jobs.
Python result: the GitHub job posting count for "java".
Let me know if you need anything clarified.
Thanks,

Web Scraping from an API Loop

I'm scraping data from the World Bank API for a paper, and I'm trying to build a loop that scrapes several different indicators, but I can't get it to work past a certain part of the code. I hope someone can help, please?
#Single code for one indicator
import requests, json
import pandas as pd

indicator = 'SP.POP.TOTL?date=2000:2020'
url = "http://api.worldbank.org/v2/countries/all/indicators/%s&format=json&per_page=5000" % indicator
response = requests.get(url)
print(response)
result = response.content
result = json.loads(result)
pop_total_df = pd.DataFrame.from_dict(result[1])
This is the loop I'm trying to build, but I get an error in the last part of the code below:
#indicator list
indicator = {'FP.CPI.TOTL.ZG?date=2000:2020', 'SP.POP.TOTL?date=2000:2020'}

#list of urls with the indicators
url_list = []
for i in indicator:
    url = "http://api.worldbank.org/v2/countries/all/indicators/%s&format=json&per_page=5000" % i
    url_list.append(url)

result_list = []
for i in url_list:
    response = requests.get(i)
    print(response)
    result_list.append(response.content)

#Erroneous code
result_json = []
for i in range(3):
    result_json.append(json.loads(result_list[i])))
As you are making 2 requests (FP.CPI.TOTL.ZG?date=2000:2020 and SP.POP.TOTL?date=2000:2020), your result_list has length 2, so its indices are 0 and 1. Use range(2) or range(len(result_list)) instead:
import requests, json

#indicator list
indicator = {'FP.CPI.TOTL.ZG?date=2000:2020', 'SP.POP.TOTL?date=2000:2020'}

#list of urls with the indicators
url_list = []
for i in indicator:
    url = "http://api.worldbank.org/v2/countries/all/indicators/%s&format=json&per_page=5000" % i
    url_list.append(url)

result_list = []
for i in url_list:
    response = requests.get(i)
    print(response)
    result_list.append(response.content)

#Fixed code
result_json = []
for i in range(len(result_list)):
    result_json.append(json.loads(result_list[i]))
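As a follow-up, each parsed response can be turned into its own DataFrame, mirroring the single-indicator code from the question (the result[0]/result[1] split is how the World Bank API returns paging metadata and observations):
import pandas as pd

# one DataFrame per indicator, in the same order as result_json
dataframes = []
for result in result_json:
    # result[0] holds paging metadata, result[1] holds the observations
    dataframes.append(pd.DataFrame.from_dict(result[1]))

print(dataframes[0].head())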

I am trying to scrape data but it only gets data from 10 pages whereas there are 26 pages

import requests
from bs4 import BeautifulSoup

r = requests.get("https://www.flipkart.com/search?as=on&as-pos=1_1_ic_lapto&as-show=on&otracker=start&page=1&q=laptop&sid=6bo%2Fb5g&viewType=list")
c = r.content
soup = BeautifulSoup(c, "html.parser")
all = soup.find_all("div", {"class": "col _2-gKeQ"})
page_nr = soup.find_all("a", {"class": "_33m_Yg"})[-1].text
print(page_nr, "number of pages were found")
#all[0].find("div",{"class":"_1vC4OE _2rQ-NK"}).text

l = []
base_url = "https://www.flipkart.com/search?as=on&as-pos=1_1_ic_lapto&as-show=on&otracker=start&page=1&q=laptop&sid=6bo%2Fb5g&viewType=list"
for page in range(0, int(page_nr) * 10, 10):
    print()
    r = requests.get(base_url + str(page) + ".html")
    c = r.content
    #c=r.json()["list"]
    soup = BeautifulSoup(c, "html.parser")
    for item in all:
        d = {}
        #price
        d["Price"] = item.find("div", {"class": "_1vC4OE _2rQ-NK"}).text
        #Name
        d["Name"] = item.find("div", {"class": "_3wU53n"}).text
        for li in item.find_all("li", {"class": "_1ZRRx1"}):
            if " EMI" in li.text:
                d["EMI"] = li.text
            else:
                d["EMI"] = None
        for li1 in item.find_all("li", {"class": "_1ZRRx1"}):
            if "Special " in li1.text:
                d["Special Price"] = li1.text
            else:
                d["Special Price"] = None
        for val in item.find_all("li", {"class": "tVe95H"}):
            if "Display" in val.text:
                d["Display"] = val.text
            elif "Warranty" in val.text:
                d["Warrenty"] = val.text
            elif "RAM" in val.text:
                d["Ram"] = val.text
        l.append(d)

import pandas
df = pandas.DataFrame(l)
This might work with standard pagination:
i = 1
items_parsed = set()
loop = True
base_url = "https://www.flipkart.com/search?as=on&as-pos=1_1_ic_lapto&as-show=on&otracker=start&page={}&q=laptop&sid=6bo%2Fb5g&viewType=list"

while True:
    page = requests.get(base_url.format(i))
    items = extract_items(page)  # placeholder: pull #yourelements# out of the page
    if not items:
        break
    for item in items:
        # Scrape your item and, once the scrape succeeds, put the url of the
        # parsed item into url_parsed (details below the code), for example:
        url_parsed = your_stuff(item)
        if url_parsed in items_parsed:
            loop = False
        items_parsed.add(url_parsed)
    if not loop:
        break
    i += 1
I formatted your URL so that the ?page=X part is filled in by base_url.format(i), letting you iterate until no items are found on a page. Note that some sites send you back to page 1 once you request max_page + 1.
If, above the maximum page, you get the items you already parsed on the first page, you can declare a set(), add the URL of every item you parse, and then check whether you have already parsed it.
Note that this is just an idea.
Since the page number in the URL is almost in the middle, I'd apply a similar change to your code:
base_url = "https://www.flipkart.com/search?as=on&as-pos=1_1_ic_lapto&as-show=on&otracker=start&page="
end_url = "&q=laptop&sid=6bo%2Fb5g&viewType=list"

for page in range(1, int(page_nr) + 1):   # page_nr is scraped as text, so convert it
    r = requests.get(base_url + str(page) + end_url)
You have access to only the first 10 pages from the initial URL.
You can make a loop from "&page=1" to "&page=26", as sketched below.
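A minimal sketch of that loop, reusing the class names from the question's code (these change frequently on the site, so treat them as assumptions):
import requests
from bs4 import BeautifulSoup

base_url = "https://www.flipkart.com/search?as=on&as-pos=1_1_ic_lapto&as-show=on&otracker=start&page="
end_url = "&q=laptop&sid=6bo%2Fb5g&viewType=list"

laptops = []
for page in range(1, 27):                       # "&page=1" to "&page=26"
    r = requests.get(base_url + str(page) + end_url)
    soup = BeautifulSoup(r.content, "html.parser")
    # re-select the items on *this* page instead of reusing the first page's list
    for item in soup.find_all("div", {"class": "col _2-gKeQ"}):
        name = item.find("div", {"class": "_3wU53n"})
        price = item.find("div", {"class": "_1vC4OE _2rQ-NK"})
        laptops.append({
            "Name": name.text if name else None,
            "Price": price.text if price else None,
        })

print(len(laptops), "laptops collected")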

Infinite Web Scraping Twitter

I'm trying to web scrape Twitter using Python 3.X, but I only collect the last 20 tweets of my request.
I would like to collect all of the data for a request between 2006 and now. For this, I think I have to create two more functions: one which will collect the last tweets and one which will collect the current tweets?
And how can I collect the data from this scrolling page? I think I have to use the tweet's id, but no matter what request I make, it's always the last 20 tweets that I get.
from pprint import pprint
from lxml import html
import requests
import datetime as dt
from bs4 import BeautifulSoup   # bs4 is the BeautifulSoup package for Python 3

def search_twitter(search):
    url = "https://twitter.com/search?f=tweets&vertical=default&q=" + search + "&src=typd&lang=fr"
    request = requests.get(url)
    sourceCode = BeautifulSoup(request.content, "lxml")
    tweets = sourceCode.find_all('li', 'js-stream-item')
    return tweets

def filter_tweets(tweets):
    data = []
    for tweet in tweets:
        if tweet.find('p', 'tweet-text'):
            dtwee = [
                ['id', tweet['data-item-id']],
                ['username', tweet.find('span', 'username').text],
                ['time', tweet.find('a', 'tweet-timestamp')['title']],
                ['tweet', tweet.find('p', 'tweet-text').text.encode('utf-8')]]
            data.append(dtwee)
            #tweet_time = dt.datetime.strptime(tweet_time, '%H:%M - %d %B %Y')
        else:
            continue
    return data

def firstlastId_tweets(tweets):
    firstID = ""
    lastID = ""
    i = 0
    for tweet in tweets:
        if(i == 0):
            firstID = tweet[0][1]
        elif(i == (len(tweets)-1)):
            lastID = tweet[0][1]
        i += 1
    return firstID, lastID

def last_tweets(search, lastID):
    url = "https://twitter.com/search?f=tweets&vertical=default&q=" + search + "&src=typd&lang=fr&max_position=TWEET-" + lastID
    request = requests.get(url)
    sourceCode = BeautifulSoup(request.content, "lxml")
    tweets = sourceCode.find_all('li', 'js-stream-item')
    return tweets

tweets = search_twitter("lol")
tweets = filter_tweets(tweets)
pprint(tweets)
firstID, lastID = firstlastId_tweets(tweets)
print(firstID, lastID)

while True:
    lastTweets = last_tweets("lol", lastID)
    pprint(lastTweets)
    firstID, lastID = firstlastId_tweets(lastTweets)
    print(firstID, lastID)
I found a good solution based on this webpage:
http://ataspinar.com/2015/11/09/collecting-data-from-twitter/
What I did was create a variable called max_pos where I stored this string:
'&max_position=TWEET-'+last_id+'-'+first_id
I stored the first_id (the position-1 tweet id) and the last_id (the position-20 tweet id).
So for the request, I used something like this, starting with max_pos empty:
request = requests.get(url + max_pos)
I see this can be a common issue, so we could post a working solution. I still do not have it showing the results the way I need, but I could simulate the "scroll down till the end" behaviour by following the guide from the link.
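A minimal sketch of that idea, reusing the question's filter_tweets and firstlastId_tweets helpers (assumed to be defined as above) and assuming the old Twitter search page still serves the same HTML, which has since changed:
import requests
from bs4 import BeautifulSoup

search = "lol"
base_url = "https://twitter.com/search?f=tweets&vertical=default&q=" + search + "&src=typd&lang=fr"
max_pos = ""          # empty for the first request
collected = []

while True:
    request = requests.get(base_url + max_pos)
    source = BeautifulSoup(request.content, "lxml")
    tweets = filter_tweets(source.find_all('li', 'js-stream-item'))
    if not tweets:
        break                                   # no more results to scroll through
    collected.extend(tweets)
    first_id, last_id = firstlastId_tweets(tweets)
    # ask for the next "scroll" of results below the ones we already have
    max_pos = '&max_position=TWEET-' + last_id + '-' + first_id

print(len(collected), "tweets collected")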

File Storage Problem with Python Web Crawler

I am screen scraping data using a web crawler and storing the results (tweets from a Twitter page) as separate HTML files for each user I'm crawling. I intend to later parse the HTML files and store the data in a database for analysis. However, I am having a bizarre problem.
When I run the following program, a small snippet from the overall crawler, I am able to get a separate HTML file for each follower:
import re
import urllib2
import twitter

start_follower = "NYTimesKrugman"
depth = 3
searched = set()

api = twitter.Api()

def crawl(follower, in_depth):
    if in_depth > 0:
        searched.add(follower)
        directory = "C:\\Python28\\Followertest1\\" + follower + ".html"
        output = open(directory, 'a')
        output.write(follower)
        output.write('\n\n')
        users = api.GetFriends(follower)
        names = set([str(u.screen_name) for u in users])
        names -= searched
        for name in list(names)[0:5]:
            crawl(name, in_depth-1)

crawl(start_follower, depth)

for x in searched:
    print x
print "Program is completed."
However, when I run the full crawler, I do not get a separate file for each follower:
import twitter
import urllib
from BeautifulSoup import BeautifulSoup
import re
import time

start_follower = "NYTimeskrugman"
depth = 2
searched = set()

api = twitter.Api()

def add_to_U(user):
    U.append(user)

def site(follower): #creates a twitter site url in string format based on the follower username
    followersite = "http://mobile.twitter.com/" + follower
    return followersite

def getPage(follower): #obtains access to a webpage
    url = site(follower)
    response = urllib.urlopen(url)
    return response

def getSoup(response): #creates the parsing module
    html = response.read()
    soup = BeautifulSoup(html)
    return soup

def gettweets(soup, output):
    tags = soup.findAll('div', {'class': "list-tweet"}) #to obtain tweets of a follower
    for tag in tags:
        a = tag.renderContents()
        b = str(a)
        output.write(b)
        output.write('\n\n')

def are_more_tweets(soup): #to check whether there is more than one page on mobile twitter
    links = soup.findAll('a', {'href': True}, {id: 'more_link'})
    for link in links:
        b = link.renderContents()
        test_b = str(b)
        if test_b.find('more') != -1:
            return True
    return False

def getnewlink(soup): #to get the link to go to the next page of tweets on twitter
    links = soup.findAll('a', {'href': True}, {id: 'more_link'})
    for link in links:
        b = link.renderContents()
        if str(b) == 'more':
            c = link['href']
            d = 'http://mobile.twitter.com' + c
            return d

def crawl(follower, in_depth): #main method of sorts
    if in_depth > 0:
        searched.add(follower)
        directory = "C:\\Python28\\Followertest2\\" + follower + ".html"
        output = open(directory, 'a')
        output.write(follower)
        output.write('\n\n')
        a = getPage(follower)
        soup = getSoup(a)
        gettweets(soup, output)
        tweets = are_more_tweets(soup)
        while(tweets):
            b = getnewlink(soup)
            red = urllib.urlopen(b)
            html = red.read()
            soup = BeautifulSoup(html)
            gettweets(soup, output)
            tweets = are_more_tweets(soup)
        users = api.GetFriends(follower)
        names = set([str(u.screen_name) for u in users])
        names -= searched
        for name in list(names)[0:5]:
            print name
            crawl(name, in_depth - 1)

crawl(start_follower, depth)
print("Program done. Look at output file.")
More specifically, I seem to get a separate html file for about the first five followers and then no new files appear to be created. Any help would be appreciated!
The depth value is different between the snippet and the full code (you're only going to get one level of recursion in the full code). Also, you only grab the first five names from the follower list (for name in list(names)[0:5]:), so you get six people total: the starting follower and their first five friends.
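As a quick sanity check (a sketch, assuming each user contributes at most the five friends that list(names)[0:5] allows), the number of files the crawler can create is bounded by the size of a 5-ary tree of the given depth:
def max_files(depth, branching=5):
    # a user reached at in_depth == 0 is never written, so the tree has `depth` levels:
    # 1 starting follower + 5 friends + 25 friends-of-friends + ...
    return sum(branching ** level for level in range(depth))

print(max_files(2))  # 6  -> matches the "about the first five followers" symptom
print(max_files(3))  # 31 -> what the snippet with depth = 3 can reach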
