Infinite Web Scraping Twitter - python

I'm trying to scrape Twitter with Python 3.x, but I only get the last 20 tweets for my request.
I would like to collect all the data for a request between 2006 and now. For this, I think I have to create two more functions: one that collects the older tweets and one that collects the current tweets?
And how can I collect the data from this infinitely scrolling page? I think I have to use the tweet IDs, but no matter what request I make, I always get the same last 20 tweets.
from pprint import pprint
from lxml import html
import requests
import datetime as dt
from BeautifulSoup import BeautifulSoup
def search_twitter(search):
    """Request the Twitter search page for ``search`` and return its tweet items.

    The query string is appended to the URL as given. The return value is the
    list of ``li`` elements carrying the ``js-stream-item`` class.
    """
    base = "https://twitter.com/search?f=tweets&vertical=default&q="
    response = requests.get(base + search + "&src=typd&lang=fr")
    page = BeautifulSoup(response.content, "lxml")
    return page.find_all('li', 'js-stream-item')
def filter_tweets(tweets):
    """Turn tweet elements into lists of ``[label, value]`` pairs.

    Elements without a ``p`` child of class ``tweet-text`` are skipped. Every
    kept element yields four pairs labelled 'id', 'username', 'time' and
    'tweet'; the tweet text is stored UTF-8 encoded and the time is kept as
    the raw ``title`` string of the timestamp link.
    """
    data = []
    for tweet in tweets:
        text_node = tweet.find('p', 'tweet-text')
        if not text_node:
            continue
        tweet_id = tweet['data-item-id']
        username = tweet.find('span', 'username').text
        timestamp = tweet.find('a', 'tweet-timestamp')['title']
        data.append([
            ['id', tweet_id],
            ['username', username],
            ['time', timestamp],
            ['tweet', text_node.text.encode('utf-8')],
        ])
    return data
def firstlastId_tweets(tweets):
    """Return the ids of the first and the last entry of ``tweets``.

    Each entry must be indexable so that ``entry[0][1]`` is its id, i.e. a
    list whose first item is an ``['id', <value>]`` pair.

    Returns:
        A ``(firstID, lastID)`` tuple. An empty sequence gives ``("", "")``.
        A single entry is both the first and the last one, so its id is
        returned twice (previously ``lastID`` was left empty in that case).
    """
    if not tweets:
        return "", ""
    return tweets[0][0][1], tweets[-1][0][1]
def last_tweets(search, lastID):
    """Request the search page for ``search`` positioned at tweet ``lastID``.

    ``lastID`` is sent as ``max_position=TWEET-<lastID>``. The return value is
    the list of ``li`` elements carrying the ``js-stream-item`` class.
    """
    base = "https://twitter.com/search?f=tweets&vertical=default&q="
    position = "&src=typd&lang=fr&max_position=TWEET-" + lastID
    response = requests.get(base + search + position)
    page = BeautifulSoup(response.content, "lxml")
    return page.find_all('li', 'js-stream-item')
# Fetch, filter and display the initial result page.
tweets = search_twitter("lol")
tweets = filter_tweets(tweets)
pprint(tweets)
firstID, lastID = firstlastId_tweets(tweets)
print(firstID, lastID)

# Keep asking for the page positioned at the last known id. The loop stops
# when there is no id to continue from, when a page has no usable tweets, or
# when the last id no longer changes (the same page is being served again).
while lastID:
    # firstlastId_tweets reads entry[0][1], so the raw elements have to be
    # converted by filter_tweets first.
    lastTweets = filter_tweets(last_tweets("lol", lastID))
    if not lastTweets:
        break
    pprint(lastTweets)
    previousLastID = lastID
    firstID, lastID = firstlastId_tweets(lastTweets)
    print(firstID, lastID)
    if lastID == previousLastID:
        break

I found a good solution based on this webpage:
http://ataspinar.com/2015/11/09/collecting-data-from-twitter/
What I did was creating a variable called max_pos where I stored this string:
'&max_position=TWEET-'+last_id+'-'+first_id
I stored the first_id (position1 Tweet id) and last_id (position20 Tweet id)
So for the request, I used something like this:
request = requests.get(url + max_pos), starting with max_pos empty.
I see this can be a common issue, so we could post a working solution. I still do not have it showing the results the way I need, but I could simulate the "scroll down till the end" behaviour by following the guide from the link.

Related

Web Scraping from an API Loop

I'm scraping data from the World Bank API for a paper and I'm trying to loop over several indicators, but my code only works up to a certain point. I hope someone can help.
# Single request for one indicator.
# The name must be ``indicator`` (it was misspelled ``indcator``), because the
# URL template below is filled from ``indicator``.
indicator = 'SP.POP.TOTL?date=2000:2020'
url = "http://api.worldbank.org/v2/countries/all/indicators/%s&format=json&per_page=5000" % indicator
response = requests.get(url)
print(response)
result = json.loads(response.content)
# Only the second element of the decoded payload is loaded into the frame.
pop_total_df = pd.DataFrame.from_dict(result[1])
This is the loop i'm trying to build but I got an error in the last part of below code:
# NOTE(review): the loop bodies in this listing have lost their indentation.
#indicator list
indicator = {'FP.CPI.TOTL.ZG?date=2000:2020','SP.POP.TOTL?date=2000:2020'}
#list of urls with the indicators
url_list = []
for i in indicator:
url = "http://api.worldbank.org/v2/countries/all/indicators/%s&format=json&per_page=5000" % i
url_list.append(url)
# Download every URL and keep the raw response body.
result_list = []
for i in url_list:
response = requests.get(i)
print(response)
result_list.append(response.content)
#Erroneous code
# NOTE(review): result_list receives one entry per URL (two here), so
# range(3) indexes one position past the end of the list.
result_json = []
for i in range(3):
# NOTE(review): the next line has one closing parenthesis too many.
result_json.append(json.loads(result_list[i])))
As you are making 2 requests (FP.CPI.TOTL.ZG?date=2000:2020 and SP.POP.TOTL?date=2000:2020), your result_list has length 2, so its valid indexes are 0 and 1. Use range(2) or range(len(result_list)) instead:
import requests, json
# Indicators to download.
indicator = {'FP.CPI.TOTL.ZG?date=2000:2020','SP.POP.TOTL?date=2000:2020'}

# One request URL per indicator.
url_list = []
for code in indicator:
    url_list.append("http://api.worldbank.org/v2/countries/all/indicators/%s&format=json&per_page=5000" % code)

# Download every URL and keep the raw response body.
result_list = []
for target in url_list:
    response = requests.get(target)
    print(response)
    result_list.append(response.content)

# Decode each body. The loop bound follows the actual number of responses
# instead of a hard-coded count.
result_json = []
for i in range(len(result_list)):
    result_json.append(json.loads(result_list[i]))

How to print all results of Beautiful Soup at once?

I have a list of twitter usernames. I need to get their number of followers. I used BS and requests. However, I've only received one account every time.
from bs4 import BeautifulSoup
import requests
import pandas as pd
# Load the usernames and build one profile URL per handle (rows 0-39).
purcsv = pd.read_csv('pureeng.csv', engine= 'python')
followers = purcsv['username']
followers.head(10)
handle = purcsv['username'][0:40]
temp = ("https://twitter.com/"+handle)
temp = temp.tolist()
# NOTE(review): loop-body indentation is missing in this listing, so it is not
# visible which of the statements below run once per URL — confirm.
for url in temp:
page = requests.get(url)
bs = BeautifulSoup(page.text,'lxml')
# Find the followers navigation item and read the data-count attribute of the
# value span inside its link.
follow_box = bs.find('li',{'class':'ProfileNav-item ProfileNav-item--followers'})
followers = follow_box.find('a').find('span',{'class':'ProfileNav-value'})
print("Number of followers: {} ".format(followers.get('data-count')))
That's because you are looping over the urls first and fetching the content for each in the same variable page here:
for url in temp:
page = requests.get(url)
so page will always contain only the last page that was fetched. To solve this, you need to process each page as soon as it is fetched:
# Parse each profile page right after downloading it and collect the
# follower counts in request order.
followers_list = []
for url in temp:
    page = requests.get(url)
    bs = BeautifulSoup(page.text, "html.parser")
    follow_box = bs.find('li', {'class': 'ProfileNav-item ProfileNav-item--followers'})
    link = follow_box.find('a')
    followers = link.find('span', {'class': 'ProfileNav-value'})
    count = followers.get('data-count')
    print("Number of followers: {} ".format(count))
    followers_list.append(count)
print(followers_list)
here is a full example to verify
from bs4 import BeautifulSoup
import requests
import pandas as pd
# Read the first 40 usernames from the CSV file.
purcsv = pd.read_csv('pureeng.csv')
handles = purcsv['username'][0:40].tolist()

followers_list = []
for handle in handles:
    url = "https://twitter.com/" + handle
    try:
        page = requests.get(url)
    except requests.RequestException as e:
        # Only request failures are expected here; other errors should surface.
        print(f"Failed to fetch page for url {url} due to: {e}")
        continue
    bs = BeautifulSoup(page.text, "html.parser")
    # Each lookup can return None when the page lacks the element, so walk
    # the chain step by step instead of crashing with an AttributeError.
    follow_box = bs.find('li', {'class': 'ProfileNav-item ProfileNav-item--followers'})
    link = follow_box.find('a') if follow_box is not None else None
    followers = link.find('span', {'class': 'ProfileNav-value'}) if link is not None else None
    if followers is None:
        print(f"No followers count found for url {url}")
        continue
    count = followers.get('data-count')
    print("Number of followers: {} ".format(count))
    followers_list.append(count)
print(followers_list)
output:
Number of followers: 13714085
Number of followers: 4706511
['13714085', '4706511']
You may consider using async functions for fetching and processing those URLs if you have too many of them.

How to scrape multiple pages using the apple i tunes api

I'm trying to scrape the Itunes API to get information for all of the podcasts available in the Apple iTunes store. Currently, I'm only able to pull 200 at a time. When I try to grab the next 200 podcasts in the list, I'm getting the same 200 as before.
https://itunes.apple.com/search?term=podcast&limit=2
https://itunes.apple.com/search?term=podcast&limit=2&offset=1
Any suggestions would be appreciated.
import requests
import pandas as pd
import time
import json
url = 'https://itunes.apple.com/search?term=podcast&limit=2'
page_size = 2  # must match the ``limit`` value in ``url``

# Probe request, kept from the original script.
res = requests.get(url,headers={'User-agent': 'project'})
res.status_code

posts = []
the_offset = 0
for _ in range(2):
    # The first request uses the bare URL; later ones append the offset.
    if the_offset == 0:
        current_url = url
    else:
        current_url = url +'&offset={}'.format(the_offset)
    res = requests.get(current_url)
    if res.status_code != 200:
        print('Error',res.status_code)
        break
    # Advance by a whole page. Stepping by 1 with limit=2 made consecutive
    # pages overlap. NOTE(review): assumes ``offset`` counts results, not
    # pages — confirm against the API.
    the_offset += page_size
    posts.extend(res.json()['results'])
    print(current_url)
    time.sleep(3)
Try changing the offset parameter:
results = 100
limit = 10
pages = results // limit
# ``pages`` is an int, so it has to go through range() to be iterated.
for i in range(pages):
    # Page i starts at result i * limit (0, 10, 20, ...), so pages never overlap.
    offset = i * limit
    # ``request`` stands for the function performing the HTTP call; it is not
    # defined in this snippet.
    request(offset)

Google news crawler flip pages

Continuing my previous work on crawling all news results for a query and returning their titles and URLs, I am refining the crawler to get the results from all pages in Google News. The current code seems to return only the first page of the Google News search results. I would be grateful to know how to get the results from all pages. Many thanks!
my codes below:
import requests
from bs4 import BeautifulSoup
import time
import datetime
from random import randint
import numpy as np
import pandas as pd
query2Google = input("What do you want from Google News?\n")
def QGN(query2Google):
    """Search Google News for an exact phrase and save the hits to a CSV file.

    The phrase is wrapped in double quotes, spaces become ``+``, and the
    results page is requested once. For every ``div`` of class ``g`` that
    contains a link, one row with the columns Title, URL and Brief is written
    to ``<query>_<today>_SearchNews.csv``.

    Args:
        query2Google: The search phrase; it is also used in the file name.

    Returns:
        None. The results are written to disk and progress is printed.
    """
    s = '"' + query2Google + '"'  # keywords for the query
    s = s.replace(" ", "+")
    date = str(datetime.datetime.now().date())  # timestamp
    filename = query2Google + "_" + date + "_" + 'SearchNews.csv'  # csv filename
    # News results (tbm=nws) restricted with tbs=qdr:y.
    url = "http://www.google.com.sg/search?q=" + s + "&tbm=nws&tbs=qdr:y"
    time.sleep(randint(0, 2))  # waiting
    htmlpage = requests.get(url)
    print("Status code: " + str(htmlpage.status_code))
    soup = BeautifulSoup(htmlpage.text, 'lxml')

    rows = []
    for result_table in soup.findAll("div", {"class": "g"}):
        a_click = result_table.find("a")
        if a_click is None:
            continue  # nothing to record for a block without a link
        # decode_contents() yields the inner markup as text. The old
        # str(bytes).strip("b'") trick also removed real leading/trailing
        # "b" and "'" characters and escaped non-ASCII text.
        title = a_click.decode_contents()
        href = str(a_click.get("href"))
        # Remove the redirect prefix only when it is actually there.
        # str.strip('/url?q=') removes any of those characters from both
        # ends and therefore damaged the URLs.
        if href.startswith('/url?q='):
            href = href[len('/url?q='):]
        brief_tag = result_table.find("div", {"class": "st"})
        brief = brief_tag.decode_contents() if brief_tag is not None else ""
        rows.append([title, href, brief])

    df1 = pd.DataFrame(rows, columns=['Title', 'URL', 'Brief'])
    print("Search Crawl Done!")
    # to_csv opens and closes the file itself, so no separate handle is kept.
    df1.to_csv(filename, index=False, encoding='utf-8')
    return
QGN(query2Google)
There used to be an AJAX API, but it's no longer available.
Still, you can modify your script with a for loop if you want to get a fixed number of pages, or a while loop if you want to get all pages.
Example :
url = "http://www.google.com.sg/search?q="+s+"&tbm=nws&tbs=qdr:y&start="
pages = 10  # the number of pages you want to crawl
# ``start`` takes the values 0, 10, 20, ... and is appended to the URL. The
# loop variable is not called ``next`` so the builtin stays usable.
for start in range(0, pages * 10, 10):
    page = url + str(start)
    time.sleep(randint(1, 5))  # you may need longer than that
    htmlpage = requests.get(page)  # you should add User-Agent and Referer
    print("Status code: " + str(htmlpage.status_code))
    if htmlpage.status_code != 200:
        break  # something went wrong
    soup = BeautifulSoup(htmlpage.text, 'lxml')
    # ... process response here ...
    next_page = soup.find('td', {'class': 'b', 'style': 'text-align:left'})
    if next_page is None or next_page.a is None:
        break  # there are no more pages
Keep in mind that google doesn't like bots , you might get a ban .
You could add 'User-Agent' and 'Referer' in headers to simulate a web browser , and use time.sleep(random.uniform(2, 6)) to simulate a human ... or use selenium.
You can also add &num=25 to the end of your query and you'll get back a page with that number of results. In this example you'll get 25 Google results back.

File Storage Problem with Python Web Crawler

I am screen scraping data using a web crawler and storing the results - (tweets from a twitter page) as separate html files for each user I'm crawling. I intend to later parse the html files and store the data into a database for analysis. However, I am having a bizarre problem.
When I run the following program - a small snippet from the overall crawler - I am able to get a separate html file for each follower:
import re
import urllib2
import twitter
start_follower = "NYTimesKrugman"
depth = 3
searched = set()
api = twitter.Api()
def crawl(follower, in_depth):
    """Record ``follower`` and recurse into up to five unvisited friends.

    The follower is added to ``searched`` and its name is appended to
    ``<follower>.html`` in the test directory. Friends come from
    ``api.GetFriends``; names already in ``searched`` are dropped and at most
    five of the rest are crawled with ``in_depth - 1``.

    Nothing happens when ``in_depth`` is not positive.
    """
    if in_depth <= 0:
        return
    searched.add(follower)
    directory = "C:\\Python28\\Followertest1\\" + follower + ".html"
    # ``with`` closes the file before recursing, so handles are not left open
    # for the whole crawl.
    with open(directory, 'a') as output:
        output.write(follower)
        output.write('\n\n')
    users = api.GetFriends(follower)
    names = set([str(u.screen_name) for u in users])
    names -= searched
    for name in list(names)[0:5]:
        crawl(name, in_depth-1)
# Run the crawl, then list every follower that was visited.
crawl(start_follower, depth)
for x in searched:
    # Call form of print: same output for a single argument on Python 2 and
    # also valid on Python 3.
    print(x)
print("Program is completed.")
However, when I run the full crawler, I do not get a separate file for each follower:
import twitter
import urllib
from BeautifulSoup import BeautifulSoup
import re
import time
start_follower = "NYTimeskrugman"
depth = 2
searched = set()
api = twitter.Api()
# Append ``user`` to the global ``U``.
# NOTE(review): ``U`` is not defined anywhere in the visible code — confirm it
# exists before this function is called.
def add_to_U(user):
U.append(user)
def site(follower):
    """Return the mobile Twitter URL (as a string) for the user name ``follower``."""
    return "http://mobile.twitter.com/" + follower
def getPage(follower):
    """Open the page whose URL ``site(follower)`` builds and return the response."""
    return urllib.urlopen(site(follower))
def getSoup(response):
    """Read the body of ``response`` and return it parsed by BeautifulSoup."""
    return BeautifulSoup(response.read())
def gettweets(soup, output):
    """Write the contents of every ``div`` of class ``list-tweet`` to ``output``.

    Each block is written as a string and followed by a blank line.
    """
    for tweet_div in soup.findAll('div', {'class' : "list-tweet"}):
        rendered = str(tweet_div.renderContents())
        output.write(rendered)
        output.write('\n\n')
def are_more_tweets(soup):
    """Return True when some link in ``soup`` has 'more' in its contents.

    Every ``a`` element with an ``href`` attribute is inspected; the result is
    False when none of them contains the text (or when there are no links).
    """
    # The former third positional argument ``{id: 'more_link'}`` was dropped.
    # Its key was the builtin ``id`` function, and that positional slot is
    # ``recursive``, so it never filtered by element id. Leaving it out keeps
    # the same matches.
    links = soup.findAll('a', {'href': True})
    return any(str(link.renderContents()).find('more') != -1 for link in links)
def getnewlink(soup):
    """Return the absolute URL of the link whose contents are exactly 'more'.

    Every ``a`` element with an ``href`` attribute is inspected; the first
    match has ``http://mobile.twitter.com`` prepended to its ``href``.
    Returns None when no link matches.
    """
    # The former third positional argument ``{id: 'more_link'}`` was dropped.
    # It was keyed by the builtin ``id`` function and landed in the
    # ``recursive`` slot, so it never restricted the search.
    for link in soup.findAll('a', {'href': True}):
        if str(link.renderContents()) == 'more':
            return 'http://mobile.twitter.com' + link['href']
    return None
def crawl(follower, in_depth):
    """Save a follower's tweets to a file, then crawl up to five friends.

    The follower is added to ``searched``. Its name and the tweets of every
    page reachable through the 'more' link are appended to
    ``<follower>.html``. Friends come from ``api.GetFriends``; names already
    in ``searched`` are dropped and at most five of the rest are crawled with
    ``in_depth - 1``.

    Nothing happens when ``in_depth`` is not positive.
    """
    if in_depth <= 0:
        return
    searched.add(follower)
    directory = "C:\\Python28\\Followertest2\\" + follower + ".html"
    # ``with`` closes the file before recursing, so handles are not left open
    # for the whole crawl.
    with open(directory, 'a') as output:
        output.write(follower)
        output.write('\n\n')
        soup = getSoup(getPage(follower))
        gettweets(soup, output)
        while are_more_tweets(soup):
            next_url = getnewlink(soup)
            if next_url is None:
                # are_more_tweets matches 'more' as a substring while
                # getnewlink needs an exact match, so there may be no URL.
                break
            soup = BeautifulSoup(urllib.urlopen(next_url).read())
            gettweets(soup, output)
    users = api.GetFriends(follower)
    names = set([str(u.screen_name) for u in users])
    names -= searched
    for name in list(names)[0:5]:
        print(name)
        crawl(name, in_depth - 1)
crawl(start_follower, depth)
print("Program done. Look at output file.")
More specifically, I seem to get a separate html file for about the first five followers and then no new files appear to be created. Any help would be appreciated!
The depth value is different between the snippet and the full code (you're only going to get one level of recursion in the full code). Also, you only grab the first five names from the followers list: for name in list(names)[0:5]: So you get six people total: the starting follower and their first five friends.

Categories