I get TwitterConnectionError when I repeat requests - python

The following code uses the Twitter API to keep fetching tweets for as long as a 'next_token' is returned. It eventually fails with an error, and I don't know what the error indicates. How should I modify the code?
Here's my code
from TwitterAPI import TwitterAPI, TwitterPager
import csv
import pandas as pd
import time

# create a dataframe to store tweets
df = pd.DataFrame()

api = TwitterAPI(consumer_key,
                 consumer_secret,
                 auth_type='oAuth2',
                 api_version='2')

# the first query, without next_token
q = {'query': '(#abc -RT lang:en)',
     'start_time': "2017-10-16T00:00:00Z",
     'end_time': "2018-03-31T23:59:59Z",
     'tweet.fields': 'created_at',
     'max_results': '500'}
r = api.request('tweets/search/all', q)
response_data = r.json()
df_a = pd.DataFrame(response_data['data'])

# add the tweets to the dataframe
df = pd.concat([df, df_a])

# loop while the response contains a 'next_token'
while 'next_token' in response_data["meta"]:
    # sleep so as not to exceed the rate limit
    time.sleep(4)
    token = response_data["meta"]['next_token']
    # query with 'next_token'
    q = {'query': '(#abc -RT lang:en)',
         'start_time': "2017-10-16T00:00:00Z",
         'end_time': "2018-03-31T23:59:59Z",
         'tweet.fields': 'created_at',
         'max_results': '500',
         'next_token': token}
    r = api.request('tweets/search/all', q)
    if r.status_code == 200:
        response_data = r.json()
        df_a = pd.DataFrame(response_data['data'])
        df = pd.concat([df, df_a])
    else:
        print("ERROR: %d" % r.status_code)
This produces the error below.
TwitterConnectionError: HTTPSConnectionPool(host='api.twitter.com', port=443): Read timed out. (read timeout=5)
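The TwitterConnectionError here is a read timeout: the connection to api.twitter.com did not return data within the library's read timeout, which can happen intermittently during long pagination runs. A common workaround is to catch the exception and retry with a short backoff instead of letting the script die. Below is a minimal sketch of that idea; it assumes TwitterConnectionError can be imported from the TwitterAPI package (as in the library's own examples), and the retry count and sleep times are arbitrary.

from TwitterAPI import TwitterAPI, TwitterConnectionError  # import path assumed
import time

def request_with_retry(api, endpoint, params, max_retries=5):
    # Retry a TwitterAPI request when the connection or read times out.
    for attempt in range(max_retries):
        try:
            r = api.request(endpoint, params)
            if r.status_code == 200:
                return r
            print("HTTP error %d, retrying..." % r.status_code)
        except TwitterConnectionError:
            print("Connection/read timeout, retrying...")
        time.sleep(2 ** attempt)  # simple exponential backoff: 1s, 2s, 4s, ...
    raise RuntimeError("Giving up after %d retries" % max_retries)

# usage inside the pagination loop above:
# r = request_with_retry(api, 'tweets/search/all', q)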

Related

How to pass multiple values to an API call in a GET request?

This is my list:
unique_IMO = [94229,95986,96967,94731,95731,96612]
I need to pass these numbers to the following request:
url = 'https://api.lloydslistintelligence.com/v1/aispositionhistory?output=json&vesselImo={0}&pageNumber={1}'.format(unique_IMO,1)
I was able to call the endpoint for each number using a for loop but I don't know how to pass all the numbers at once.
I tried the below code but it still gave an error.
test1 = format(','.join(map(str,unique_IMO)))
Can someone please help me with this?
I have a list of numbers which I am trying to pass all at once to an API call. I did check using Postman to see if the endpoint accepts multiple values and it does.
API documentation snippet
So below is what I'm doing right now, and it works. I am trying to make the API calls faster and more efficient.
import requests
import pandas as pd
from pandas import json_normalize

df_list = []
for ind, row in vessels.iterrows():
    vesselImo = int(row['Imo'])

    # Retrieve data from the aispositionhistory endpoint
    vessel_hist = pd.DataFrame()
    total_recs = 0
    for date_string in date_list:
        url = 'https://api.lloydslistintelligence.com/v1/aispositionhistory?output=json&vesselImo={0}&dateRange={1}&pageNumber={2}'.format(vesselImo, date_string, 1)
        head = {'Authorization': '{}'.format(api_token)}
        response = requests.get(url, headers=head)

        # ****DEBUGGING****
        # print("status code: ", response.status_code)

        if response.json()['Data']['totalRecords'] != 0:
            tmp = response.json()['Data']['items']
            df = json_normalize(tmp)
            vessel_hist = vessel_hist.append(df, ignore_index=True)

        # Get the reported number of records for validation
        total_recs = total_recs + response.json()['Data']['totalRecords']

        # Identify if the API response spans multiple pages
        if response.json()['Data']['totalPages'] > 1:
            num_pages = response.json()['Data']['totalPages']
            # print('API pull had more than one page: ' + date_string)
            for page_no in range(2, num_pages + 1):
                # request each remaining page
                url = 'https://api.lloydslistintelligence.com/v1/aispositionhistory?output=json&vesselImo={0}&dateRange={1}&pageNumber={2}'.format(vesselImo, date_string, page_no)
                response = requests.get(url, headers=head)
                tmp = response.json()['Data']['items']
                df = json_normalize(tmp)
                vessel_hist = vessel_hist.append(df, ignore_index=True)

    # Validation based on record count
    if total_recs != vessel_hist.shape[0]:
        print('Validation Error: reported records do not match dataframe')

    if vessel_hist.shape[0] > 0:
        # Format the dataframe
        new_columns = ['vesselId', 'MMSI', 'PositionTimestamp', 'Latitude', 'Longitude', 'Speed', 'Course', 'Rot', 'Heading',
                       'nearestPlace', 'nearestPlaceId', 'nearestCountry', 'Distance', 'Destination', 'Eta', 'Draught',
                       'Dimensions', 'Status', 'Ship_type', 'Source']
        vessel_hist.columns = new_columns
        vessel_hist = vessel_hist[['MMSI', 'PositionTimestamp', 'Status', 'Latitude', 'Longitude', 'Speed', 'Course', 'Rot',
                                   'Heading', 'Draught', 'Destination', 'Eta', 'Source', 'Ship_type', 'Dimensions',
                                   'Distance', 'nearestCountry', 'nearestPlace', 'nearestPlaceId', 'vesselId']]
        vessel_hist['PositionTimestamp'] = pd.to_datetime(vessel_hist['PositionTimestamp'], dayfirst=False)
        vessel_hist.sort_values('PositionTimestamp', inplace=True)
        vessel_hist.reset_index(drop=True, inplace=True)

    df_list.append(vessel_hist)
    print('Input vessel Id: ' + str(vesselImo))
    print('Input Date Range: ' + start_input + ' - ' + end_input)
    print('No. of AIS records: ' + str(vessel_hist.shape[0]))

df_list
vessels is a dataframe which contains the IMO numbers
vessels = pd.DataFrame((94229,95986,96967,94731,95731,96612),columns=['Imo'])
date_list is a list created based on the desired time range.
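For reference, date_list could be built with pandas; the exact string format the dateRange parameter expects is an assumption here (one "start,end" pair per calendar month), so adjust it to whatever the API documentation specifies:

import pandas as pd

start_input, end_input = '2022-01-01', '2022-03-31'
# hypothetical format: one "start,end" range string per calendar month
month_starts = pd.date_range(start_input, end_input, freq='MS')
date_list = ['{0},{1}'.format(d.strftime('%Y-%m-%d'),
                              (d + pd.offsets.MonthEnd(1)).strftime('%Y-%m-%d'))
             for d in month_starts]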
Hope this example will help:
import requests

def main():
    unique_IMO = [94229, 95986, 96967, 94731, 95731, 96612]
    base_url = "http://httpbin.org"
    query_params = {
        "output": "json",
        "vesselImo": unique_IMO,
        "pagerNumber": 1
    }
    response = requests.get(url=base_url + "/get", params=query_params)
    print(response.json())

if __name__ == '__main__':
    main()
GET query parameters will be:
{'args': {'output': 'json', 'pagerNumber': '1', 'vesselImo': ['94229', '95986', '96967', '94731', '95731', '96612']}, 'headers': {'Accept': '*/*', 'Accept-Encoding': 'gzip, deflate', 'Host': 'httpbin.org', 'User-Agent': 'python-requests/2.27.1', 'X-Amzn-Trace-Id': 'Root=1-633bc95b-2771014e17fa5dc6580e4e3e'}, 'origin': 'x.x.x.x', 'url': 'http://httpbin.org/get?output=json&vesselImo=94229&vesselImo=95986&vesselImo=96967&vesselImo=94731&vesselImo=95731&vesselImo=96612&pagerNumber=1'}
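As the httpbin echo shows, requests serializes a list value as a repeated query parameter (vesselImo=94229&vesselImo=95986&...). If the Lloyd's List endpoint instead expects a single comma-separated value, which is something to verify against the documentation snippet or the Postman test mentioned above, the list can be joined before sending. This is only a sketch mirroring the question's URL, not a confirmed behaviour of that API:

import requests

unique_IMO = [94229, 95986, 96967, 94731, 95731, 96612]
params = {
    "output": "json",
    "vesselImo": ",".join(map(str, unique_IMO)),  # single comma-separated value
    "pageNumber": 1
}
# hypothetical call mirroring the question's endpoint
# response = requests.get('https://api.lloydslistintelligence.com/v1/aispositionhistory',
#                         params=params, headers={'Authorization': api_token})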

Data mining inside Twitter but getting less data

I would like to get tweets matching specific keywords, within a specific time window, in a specific language. I have all the credentials from the Twitter Developer Portal (key, token, bearer). I tried one extraction with this code:
import tweepy
from twitter_authentication import bearer_token
import time
import pandas as pd

client = tweepy.Client(bearer_token, wait_on_rate_limit=True)

frana_tweets = []
for response in tweepy.Paginator(client.search_all_tweets,
                                 query='frana OR smottamento OR scivolamento OR crollo OR dissesto lang:it place_country:IT',
                                 user_fields=['username', 'public_metrics', 'description', 'location', 'created_at', 'entities', 'url', 'verified'],
                                 tweet_fields=['created_at', 'geo', 'entities', 'lang', 'non_public_metrics', 'public_metrics', 'source'],
                                 place_fields=['country', 'place_type'],
                                 expansions=['author_id', 'geo.place_id'],
                                 start_time='2019-11-01T00:00:00Z',
                                 end_time='2019-11-30T23:59:59Z',
                                 max_results=500):
    time.sleep(1)
    frana_tweets.append(response)

result = []
user_dict = {}
place_dict = {}
tweet_dict = {}

# Loop through each response object
for response in frana_tweets:
    # Take all of the users, and put them into a dictionary of dictionaries with the info we want to keep
    for user in response.includes['users']:
        user_dict[user.id] = {'name': user.name,
                              'username': user.username,
                              'created_at': user.created_at,
                              'description': user.description,
                              'entities': user.entities,
                              'location': user.location,
                              'pinned_tweet_id': user.pinned_tweet_id,
                              'protected': user.protected,
                              'followers_count': user.public_metrics['followers_count'],
                              'following_count': user.public_metrics['following_count'],
                              'tweet_count': user.public_metrics['tweet_count'],
                              'listed_count': user.public_metrics['listed_count'],
                              'url': user.url,
                              'verified': user.verified
                              }
    for place in response.includes['places']:
        place_dict[place.id] = {'geo_id': place.id,
                                'full_name': place.full_name,
                                'country': place.country,
                                'place_type': place.place_type
                                }
    for tweet in response.data:
        # For each tweet, find the author's information
        author_info = user_dict[tweet.author_id]
        geo_info = place_dict[place.id]
        # Put all of the information we want to keep in a single dictionary for each tweet
        result.append({'author_id': tweet.author_id,
                       'name': author_info['name'],
                       'username': author_info['username'],
                       'author_created_at': author_info['created_at'],
                       'author_description': author_info['description'],
                       'author_entities': author_info['entities'],
                       'author_location': author_info['location'],
                       'pinned_tweet_id': author_info['pinned_tweet_id'],
                       'protected': author_info['protected'],
                       'author_followers': author_info['followers_count'],
                       'author_following': author_info['following_count'],
                       'author_tweet_count': author_info['tweet_count'],
                       'author_listed_count': author_info['listed_count'],
                       'author_url': author_info['url'],
                       'author_verified': author_info['verified'],
                       'id_text': tweet.id,  # identifies the tweet
                       'text': tweet.text,
                       'created_at': tweet.created_at,
                       'lang': tweet.lang,
                       'geo': tweet.geo,
                       'entities': tweet.entities,
                       'retweets': tweet.public_metrics['retweet_count'],
                       'replies': tweet.public_metrics['reply_count'],
                       'likes': tweet.public_metrics['like_count'],
                       'quote_count': tweet.public_metrics['quote_count'],
                       'non_public_metrics': tweet.non_public_metrics,
                       # 'in_reply_to_user_id': tweet.in_reply_to_user_id,
                       'source': tweet.source,
                       'geo_id': geo_info['geo_id'],
                       'full_name': geo_info['full_name'],
                       'country': geo_info['country'],
                       'place_type': geo_info['place_type']
                       })

# Change this list of dictionaries into a dataframe
df4 = pd.DataFrame(result)
But it didn't get all the data from Twitter; some tweets were not extracted. Why?
I tried another approach, but I have the same problem:
import requests
import os
import json
import twitter_authentication as config
import time
import pandas as pd

# Save your bearer token in a file called twitter_authentication.py in this directory
# It should look like this:
# bearer_token = 'name_Bearer_token'
bearer_token = config.bearer_token

query = 'frana OR smottamento OR scivolamento OR crollo OR dissesto lang:it'
out_file = 'raw_tweets.txt'
search_url = "https://api.twitter.com/2/tweets/search/all"

# Optional params: start_time, end_time, since_id, until_id, max_results, next_token,
# expansions, tweet.fields, media.fields, poll.fields, place.fields, user.fields
query_params = {'query': query,
                'start_time': '2019-11-01T00:00:00Z',
                'end_time': '2019-11-30T23:59:59Z',
                'tweet.fields': 'id,text,author_id,in_reply_to_user_id,geo,conversation_id,created_at,lang,public_metrics,referenced_tweets,reply_settings,source',
                'user.fields': 'id,name,username,created_at,description,public_metrics,verified',
                'expansions': 'author_id,geo.place_id',
                'place.fields': 'full_name,id,country,country_code,geo,name,place_type',
                'max_results': 500
                }

def create_headers(bearer_token):
    headers = {"Authorization": "Bearer {}".format(bearer_token)}
    return headers

def connect_to_endpoint(url, headers, params, next_token=None):
    if next_token:
        params['next_token'] = next_token
    response = requests.request("GET", search_url, headers=headers, params=params)
    time.sleep(3.1)
    print(response.status_code)
    if response.status_code != 200:
        raise Exception(response.status_code, response.text)
    return response.json()

def get_tweets(num_tweets, output_fh):
    next_token = None
    tweets_stored = 0
    while tweets_stored < num_tweets:
        headers = create_headers(bearer_token)
        json_response = connect_to_endpoint(search_url, headers, query_params, next_token)
        if json_response['meta']['result_count'] == 0:
            break
        author_dict = {x['id']: x['username']
                       for x in json_response['includes']['users']}
        for tweet in json_response['data']:
            try:
                tweet['username'] = author_dict[tweet['author_id']],
                # 'username': author_info['username'],
                tweet['id'] = tweet['entities'],
            except KeyError:
                print(f"No data for {tweet['author_id']}")
            output_fh.write(json.dumps(tweet) + '\n')
            tweets_stored += 1
        try:
            next_token = json_response['meta']['next_token']
        except KeyError:
            break
    return None

def main():
    with open(out_file, 'w') as f:
        get_tweets(1000, f)

main()

tweets = []
with open(out_file, 'r') as f:
    for row in f.readlines():
        tweet = json.loads(row)
        tweets.append(tweet)

tweets[0]
df4 = pd.DataFrame(tweets)
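One way to narrow down where tweets are going missing is to compare the result_count the API reports for each page with what actually ends up in the dataframe, and to count matches for the same window without the most restrictive operator (place_country:IT only matches tweets that have a place tagged, which most tweets do not). Below is a rough diagnostic sketch using the tweepy client from the first attempt; it assumes the full-archive counts endpoint is available on the same Academic access used by search_all_tweets:

# compare what the API reported per page with what was collected
reported = sum(resp.meta.get('result_count', 0) for resp in frana_tweets)
print(f"API reported {reported} tweets, result list has {len(result)} entries")

# count matches for the same window without the geo filter (no tweet cap applies here)
counts = client.get_all_tweets_count(
    query='frana OR smottamento OR scivolamento OR crollo OR dissesto lang:it',
    start_time='2019-11-01T00:00:00Z',
    end_time='2019-11-30T23:59:59Z',
    granularity='day')
print(sum(c['tweet_count'] for c in counts.data))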

Google Analytics API crashes every time

I am using the Google Analytics API to extract data, and I would like to dump a year's worth of data into a CSV file. I have also implemented paging to handle the large result set. When I run the code it starts dumping data into the CSV, but then crashes with the following message, and this happens every time I run it. It should also be noted that I tried extracting the data for just one day and still hit the same error.
googleapiclient.errors.HttpError: <HttpError 503 when requesting https://analyticsreporting.googleapis.com/v4/reports:batchGet?alt=json returned "The service is currently unavailable.">
The following is my code. Any help on this would be greatly appreciated.
import httplib2 as lib2
import google.oauth2.credentials
from google_auth_httplib2 import AuthorizedHttp
from datetime import datetime
import psycopg2

# Packages needed for connecting with Google API
from googleapiclient.discovery import build as google_build

# Data processing packages
import pandas
import numpy
import json
from datetime import datetime, timedelta

access_token = "***********"
refresh_token = "**********"
client_id = "***********"
client_secret = "*************"
token_uri = 'https://oauth2.googleapis.com/token'
token_expiry = datetime.now() - timedelta(days=1)
# ¯\_(ツ)_/¯
user_agent = 'my-user-agent/1.0'

credentials = google.oauth2.credentials.Credentials(access_token,
                                                    refresh_token=refresh_token,
                                                    token_uri='https://oauth2.googleapis.com/token',
                                                    client_id=client_id,
                                                    client_secret=client_secret)

# Authorize client
authorized = AuthorizedHttp(credentials=credentials)

api_name = 'analyticsreporting'
api_version = 'v4'

# Let's build the client
api_client_1dayactiveusers = google_build(serviceName=api_name, version=api_version, http=authorized)

pageToken_1dayactiveusers = 'firstcall'

# for user types
while pageToken_1dayactiveusers != None:
    sample_request = {
        'viewId': '**********',
        'dateRanges': {
            'startDate': datetime.strftime(datetime.now() - timedelta(days=365), '%Y-%m-%d'),
            'endDate': datetime.strftime(datetime.now(), '%Y-%m-%d')
        },
        'dimensions': [{'name': 'ga:date'}],
        'metrics': [{'expression': 'ga:1dayUsers', 'alias': 'onedayusers'}],
        'pageToken': pageToken_1dayactiveusers
    }
    response_1dayactiveusers = api_client_1dayactiveusers.reports().batchGet(
        body={
            'reportRequests': sample_request
        }).execute()
    print(response_1dayactiveusers)
    pageToken = response_1dayactiveusers.get("reports")[0].get('nextPageToken', None)
    print(pageToken)

def parse_response(report):
    """Parses and prints the Analytics Reporting API V4 response"""
    # Initialize results, in list format because two dataframes might return
    result_list = []
    # Initialize empty data containers for the two dateranges (if there are two, that is)
    data_csv = []
    data_csv2 = []
    # Initialize header rows
    header_row = []
    # Get column headers, metric headers, and dimension headers.
    columnHeader = report.get('columnHeader', {})
    metricHeaders = columnHeader.get('metricHeader', {}).get('metricHeaderEntries', [])
    dimensionHeaders = columnHeader.get('dimensions', [])
    # Combine all of those headers into the header_row, which is in a list format
    for dheader in dimensionHeaders:
        header_row.append(dheader)
    for mheader in metricHeaders:
        header_row.append(mheader['name'])
    # Get data from each of the rows, and append them into a list
    rows = report.get('data', {}).get('rows', [])
    for row in rows:
        row_temp = []
        dimensions = row.get('dimensions', [])
        metrics = row.get('metrics', [])
        for d in dimensions:
            row_temp.append(d)
        for m in metrics[0]['values']:
            row_temp.append(m)
        data_csv.append(row_temp)
    # Put those lists into a pandas dataframe, and append it to the final result
    result_df = pandas.DataFrame(data_csv, columns=header_row)
    result_list.append(result_df)
    return result_list

response_data = response_1dayactiveusers.get('reports', [])[0]
df = parse_response(response_data)[0]
df.to_csv('/Users/ga_csv_2.csv', mode='a', header=False)
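A 503 from analyticsreporting.googleapis.com means the service was temporarily unavailable, and Google's guidance for the Reporting API is to retry such responses with exponential backoff rather than treat them as fatal. One minimal way to do that with this client is sketched below: wrap the batchGet call, catch googleapiclient's HttpError, and retry a few times (the retry count and delays here are arbitrary; execute() also accepts a num_retries argument that performs similar retries internally):

import time
from googleapiclient.errors import HttpError

def batch_get_with_backoff(client, request_body, max_retries=5):
    # Retry transient Reporting API failures (HTTP 429/500/503) with exponential backoff.
    for attempt in range(max_retries):
        try:
            return client.reports().batchGet(body=request_body).execute()
        except HttpError as err:
            if err.resp.status in (429, 500, 503):
                time.sleep(2 ** attempt)  # 1s, 2s, 4s, ...
                continue
            raise  # non-transient errors are re-raised
    raise RuntimeError("Reporting API still unavailable after retries")

# usage in the loop above:
# response_1dayactiveusers = batch_get_with_backoff(
#     api_client_1dayactiveusers, {'reportRequests': sample_request})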

Scraping pricing off a search bar - reached server limit

With help from Stack Overflow, I was able to come up with this scraper. The code returns a list of part numbers and their corresponding prices:
part1 price1
part2 price2
...
...
partn pricen
However, the website seems to only allow 200 results per request: when I raise the limit above 200 I get the error "raise JSONDecodeError("Expecting value", s, err.value) from None JSONDecodeError: Expecting value".
Is there a way to avoid this error? If not, I can increase start by 200 each time, but since I can easily have 100k+ items that won't be very efficient. Is there a way to loop over the limit and start parameters?
Please see the code below; any help is appreciated!
import requests
# import pprint  # to format data on screen: pprint.pprint()
import pandas as pd

# --- functions ---

def get_data(query):
    """Get data from server"""
    payload = {
        # "facets": [{
        #     "name": "OEM",
        #     "value": "GE%20Healthcare"
        # }],
        "facets": [],
        "facilityId": 38451,
        "id_ins": "a2a3d332-73a7-4194-ad87-fe7412388916",
        "limit": 200,
        "query": query,
        "referer": "/catalog/Service",
        "start": 0,
        # "urlParams": [{
        #     "name": "OEM",
        #     "value": "GE Healthcare"
        # }],
        "urlParams": []
    }
    r = requests.post('https://prodasf-vip.partsfinder.com/Orion/CatalogService/api/v1/search', json=payload)
    data = r.json()
    return data

all_queries = ['GE Healthcare']

for query in all_queries:
    # print('\n--- QUERY:', query, '---\n')
    data = get_data(query)
    Part_Num = []
    Vendor_Item_Num = []
    price = []
    for item in data['products']:
        if not item['options']:
            Part_Num.append([])
            Vendor_Item_Num.append([])
            price.append([])
        else:
            all_prices = [option['price'] for option in item['options']]
            all_vendor = [option['price'] for option in item['options']]
            all_part_num = item['partNumber']
            Part_Num.append(all_part_num)
            Vendor_Item_Num.append(all_vendor)
            price.append(all_prices)
    list_of_dataframes = [pd.DataFrame(Part_Num), pd.DataFrame(price)]
    pd.concat(list_of_dataframes, axis=1).to_csv(r'C:\Users\212677036\Documents\output7.csv')
You should always check status_code to confirm that your request was successful; the json() method will fail if the HTTP request was not. This API returns HTTP 500 when limit is greater than 200. You need to study the API's documentation: many APIs cap requests per second and maximum request size so they can maintain a reliable service.
You can get the data in batches. In the sample code below I stop early because I don't want to stay in the loop for 500+ iterations; you could also consider threading so the requests are not so sequential (a sketch of that follows the code).
All of this is covered in existing SO answers about the prodasf-vip endpoint.
import requests
import pandas as pd

query = 'GE Healthcare'

payload = {
    "facets": [],
    "facilityId": 38451,
    "id_ins": "a2a3d332-73a7-4194-ad87-fe7412388916",
    "limit": 200,
    "query": query,
    "referer": "/catalog/Service",
    "start": 0,
    "urlParams": []
}

r = requests.post('https://prodasf-vip.partsfinder.com/Orion/CatalogService/api/v1/search', json=payload)
if r.status_code == 200:
    js = r.json()
    df = pd.json_normalize(js["products"])
    # keep requesting the next batch of 200 until everything is fetched (capped at 2000 here)
    while len(df) < js["totalResults"] and len(df) < 2000:
        payload["start"] += 200
        r = requests.post('https://prodasf-vip.partsfinder.com/Orion/CatalogService/api/v1/search', json=payload)
        if r.status_code == 200:
            df = pd.concat([df, pd.json_normalize(r.json()["products"])])
        else:
            break
    print(f"want: {js['totalResults']} got: {len(df)}")

df
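Since each batch is just a different start offset, the "consider threading" suggestion above can be sketched with concurrent.futures: fetch the first page to learn totalResults, then request the remaining offsets in parallel. This is only a sketch of the idea, not tested against this server; the endpoint may throttle parallel requests, so the worker count is a guess to reduce (or drop entirely) if errors start coming back:

import requests
import pandas as pd
from concurrent.futures import ThreadPoolExecutor

URL = 'https://prodasf-vip.partsfinder.com/Orion/CatalogService/api/v1/search'

def fetch_batch(start, query='GE Healthcare', limit=200):
    # Fetch one batch of raw JSON beginning at the given offset.
    payload = {
        "facets": [], "facilityId": 38451,
        "id_ins": "a2a3d332-73a7-4194-ad87-fe7412388916",
        "limit": limit, "query": query,
        "referer": "/catalog/Service", "start": start, "urlParams": []
    }
    r = requests.post(URL, json=payload)
    return r.json() if r.status_code == 200 else {"products": []}

first = fetch_batch(0)
total = min(first["totalResults"], 2000)  # same 2000 cap as above; remove it to fetch everything

# a handful of workers; too many parallel requests may trip the server's rate limiting
with ThreadPoolExecutor(max_workers=4) as pool:
    batches = pool.map(fetch_batch, range(200, total, 200))

df = pd.concat([pd.json_normalize(b["products"]) for b in [first, *batches]],
               ignore_index=True)
print(f"want: {first['totalResults']} got: {len(df)}")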

Getting an error while fetching full data with my Python function

I am trying to fetch product data from an API.
By default the API returns 20 products, and a single request can return at most 500 products if the Limit=500 parameter is used.
So to fetch all products we also need to pass the Offset parameter (the number of products to skip) along with Limit.
I have written the following function to achieve this, but when fetching the full data it does not work well and gives me an error like 'Login failed. Signature mismatching'.
import datetime
import urllib.parse
import requests
from hashlib import sha256
from hmac import HMAC
from time import sleep

def get_data(userid, api_key, action, pagination=True):
    timeformat = datetime.datetime.now().replace(microsecond=0).isoformat() + '+08:00'
    endpoint = 'https://example.com'
    page_json = {}
    # set the required parameters for this API
    parameters = {
        'UserID': userid,
        'Version': '1.0',
        'Action': action,
        'Format': 'JSON',
        'Timestamp': timeformat
    }
    if pagination:
        page = 0
        parameters['Limit'] = 500
        while True:
            parameters['Offset'] = 500 * page
            # set the required cryptographic signature
            concatenated = urllib.parse.urlencode(sorted(parameters.items()))
            parameters['Signature'] = HMAC(api_key, concatenated.encode('utf-8'), sha256).hexdigest()
            page += 1
            try:
                response = requests.get(endpoint, params=parameters)
                page_json = response.json()
            except requests.exceptions.ConnectionError:
                print("Connection refused!")
                sleep(5)
    else:
        try:
            concatenated = urllib.parse.urlencode(sorted(parameters.items()))
            # set the required cryptographic signature
            parameters['Signature'] = HMAC(api_key, concatenated.encode('utf-8'), sha256).hexdigest()
            response = requests.get(endpoint, params=parameters)
            page_json = response.json()
        except requests.exceptions.ConnectionError:
            print("Connection refused!")
            sleep(5)
    return page_json
It looks like I am not handling the signature parameter correctly when fetching the full data. I printed the value of concatenated and it looks like this:
page is 1
concatenated:: Action=GetProducts&Format=JSON&Limit=500&Offset=500&Signature=3d9cd320a4bf816aeea828b9392ed2d5a27cd584b3a337338909c0ab161a101e&Timestamp=2018-05-26T12%3A58%3A38%2B08%3A00&UserID=contact%40example.com.sg&Version=1.0
try: {'ErrorResponse': {'Head': {'ErrorCode': '7', 'ErrorMessage': 'E7:Login failed. Signature mismatching', 'ErrorType': 'Sender', 'RequestAction': 'GetProducts', 'RequestId': '0bb606c015273197313552686ec46f'}}}
page is 2
concatenated:: Action=GetProducts&Format=JSON&Limit=500&Offset=1000&Signature=c1bda1a5ab21c4e4182cc82ca7ba87cb9fc6c5f24c36f9bb006f9da906cf7083&Timestamp=2018-05-26T12%3A58%3A38%2B08%3A00&UserID=contact%40example.com.sg&Version=1.0
try: {'ErrorResponse': {'Head': {'ErrorCode': '7', 'ErrorMessage': 'E7:Login failed. Signature mismatching', 'ErrorType': 'Sender', 'RequestAction': 'GetProducts', 'RequestId': '0bb606c015273197321748243ec3a5'}}}
Can you please look at my function and help me figure out what I have written wrong and what it should look like?
Try this please:
if pagination:
    page = 0
    parameters['Limit'] = 500
    while True:
        parameters['Offset'] = 500 * page
        # set the required cryptographic signature
        concatenated = urllib.parse.urlencode(sorted(parameters.items()))
        parameters['Signature'] = HMAC(api_key, concatenated.encode('utf-8'), sha256).hexdigest()
        page += 1
        try:
            response = requests.get(endpoint, params=parameters)
            page_json = response.json()
        except requests.exceptions.ConnectionError:
            print("Connection refused!")
            sleep(5)
        # remove the previous signature before signing the next request
        del parameters['Signature']
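The debug output in the question shows why this helps: from the second iteration on, the old Signature key is still sitting in parameters, so it gets included in the concatenated string that is signed, and the server's own computation (which excludes the signature) no longer matches. Deleting the signature at the end of each pass keeps the signed string clean. An equivalent way to avoid the problem, sketched below, is to sign a fresh copy of the parameters on every request so the stored dictionary never accumulates a Signature entry (like the original, this still needs a stopping condition once the API returns no more products):

if pagination:
    parameters['Limit'] = 500
    page = 0
    while True:
        request_params = dict(parameters, Offset=500 * page)  # fresh copy, never contains an old Signature
        concatenated = urllib.parse.urlencode(sorted(request_params.items()))
        request_params['Signature'] = HMAC(api_key, concatenated.encode('utf-8'), sha256).hexdigest()
        page += 1
        try:
            response = requests.get(endpoint, params=request_params)
            page_json = response.json()
        except requests.exceptions.ConnectionError:
            print("Connection refused!")
            sleep(5)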
