Data mining inside Twitter but getting less data - Python
I would like to get tweets matching specific keywords, within a specific time window, and in a specific language. I have all the credentials from the Twitter Developer Portal (key, token, and bearer token). I tried one extraction using this code:
import tweepy
from twitter_authentication import bearer_token
import time
import pandas as pd

client = tweepy.Client(bearer_token, wait_on_rate_limit=True)

frana_tweets = []
for response in tweepy.Paginator(client.search_all_tweets,
                                 query='frana OR smottamento OR scivolamento OR crollo OR dissesto lang:it place_country:IT',
                                 user_fields=['username', 'public_metrics', 'description', 'location', 'created_at', 'entities', 'url', 'verified'],
                                 tweet_fields=['created_at', 'geo', 'entities', 'lang', 'non_public_metrics', 'public_metrics', 'source'],
                                 place_fields=['country', 'place_type'],
                                 expansions=['author_id', 'geo.place_id'],
                                 start_time='2019-11-01T00:00:00Z',
                                 end_time='2019-11-30T23:59:59Z',
                                 max_results=500):
    time.sleep(1)
    frana_tweets.append(response)

result = []
user_dict = {}
place_dict = {}
tweet_dict = {}

# Loop through each response object
for response in frana_tweets:
    # Take all of the users, and put them into a dictionary of dictionaries with the info we want to keep
    for user in response.includes['users']:
        user_dict[user.id] = {'name': user.name,
                              'username': user.username,
                              'created_at': user.created_at,
                              'description': user.description,
                              'entities': user.entities,
                              'location': user.location,
                              'pinned_tweet_id': user.pinned_tweet_id,
                              'protected': user.protected,
                              'followers_count': user.public_metrics['followers_count'],
                              'following_count': user.public_metrics['following_count'],
                              'tweet_count': user.public_metrics['tweet_count'],
                              'listed_count': user.public_metrics['listed_count'],
                              'url': user.url,
                              'verified': user.verified
                              }
    for place in response.includes['places']:
        place_dict[place.id] = {'geo_id': place.id,
                                'full_name': place.full_name,
                                'country': place.country,
                                'place_type': place.place_type
                                }
    for tweet in response.data:
        # For each tweet, find the author's information
        author_info = user_dict[tweet.author_id]
        geo_info = place_dict[place.id]
        # Put all of the information we want to keep in a single dictionary for each tweet
        result.append({'author_id': tweet.author_id,
                       'name': author_info['name'],
                       'username': author_info['username'],
                       'author_created_at': author_info['created_at'],
                       'author_description': author_info['description'],
                       'author_entities': author_info['entities'],
                       'author_location': author_info['location'],
                       'pinned_tweet_id': author_info['pinned_tweet_id'],
                       'protected': author_info['protected'],
                       'author_followers': author_info['followers_count'],
                       'author_following': author_info['following_count'],
                       'author_tweet_count': author_info['tweet_count'],
                       'author_listed_count': author_info['listed_count'],
                       'author_url': author_info['url'],
                       'author_verified': author_info['verified'],
                       'id_text': tweet.id,  # identifies the tweet
                       'text': tweet.text,
                       'created_at': tweet.created_at,
                       'lang': tweet.lang,
                       'geo': tweet.geo,
                       'entities': tweet.entities,
                       'retweets': tweet.public_metrics['retweet_count'],
                       'replies': tweet.public_metrics['reply_count'],
                       'likes': tweet.public_metrics['like_count'],
                       'quote_count': tweet.public_metrics['quote_count'],
                       'non_public_metrics': tweet.non_public_metrics,
                       # 'in_reply_to_user_id': tweet.in_reply_to_user_id,
                       'source': tweet.source,
                       'geo_id': geo_info['geo_id'],
                       'full_name': geo_info['full_name'],
                       'country': geo_info['country'],
                       'place_type': geo_info['place_type']
                       })

# Change this list of dictionaries into a dataframe
df4 = pd.DataFrame(result)
But it didn't get all the data from Twitter; some tweets that match the query were not extracted. Why?
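As a first diagnostic (a minimal sketch of my own, assuming the same query and the Academic Research access that search_all_tweets already requires), the full-archive counts endpoint reports how many tweets actually match the query in that window, which makes it possible to tell whether the paginator is dropping results or the query simply matches fewer tweets than expected:

import tweepy
from twitter_authentication import bearer_token

client = tweepy.Client(bearer_token, wait_on_rate_limit=True)

# Count matching tweets per day over the same window (no tweet objects are returned).
counts = client.get_all_tweets_count(
    query='frana OR smottamento OR scivolamento OR crollo OR dissesto lang:it place_country:IT',
    start_time='2019-11-01T00:00:00Z',
    end_time='2019-11-30T23:59:59Z',
    granularity='day',
)
print('Tweets matching the query:', sum(bucket['tweet_count'] for bucket in counts.data))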
I have tried another approach, but I have the same problem:
import requests
import os
import json
import twitter_authentication as config
import time
import pandas as pd
# Save your bearer token in a file called twitter_authentication.py in this directory.
# It should look like this:
# bearer_token = 'name_Bearer_token'
bearer_token = config.bearer_token
query = 'frana OR smottamento OR scivolamento OR crollo OR dissesto lang:it'
out_file = 'raw_tweets.txt'
search_url = "https://api.twitter.com/2/tweets/search/all"
# Optional params: start_time,end_time,since_id,until_id,max_results,next_token,
# expansions,tweet.fields,media.fields,poll.fields,place.fields,user.fields
query_params = {'query': query,
                'start_time': '2019-11-01T00:00:00Z',
                'end_time': '2019-11-30T23:59:59Z',
                'tweet.fields': 'id,text,author_id,in_reply_to_user_id,geo,conversation_id,created_at,lang,public_metrics,referenced_tweets,reply_settings,source',
                'user.fields': 'id,name,username,created_at,description,public_metrics,verified',
                'expansions': 'author_id,geo.place_id',
                'place.fields': 'full_name,id,country,country_code,geo,name,place_type',
                'max_results': 500
                }
def create_headers(bearer_token):
    headers = {"Authorization": "Bearer {}".format(bearer_token)}
    return headers
def connect_to_endpoint(url, headers, params, next_token=None):
    if next_token:
        params['next_token'] = next_token
    response = requests.request("GET", search_url, headers=headers, params=params)
    time.sleep(3.1)
    print(response.status_code)
    if response.status_code != 200:
        raise Exception(response.status_code, response.text)
    return response.json()
def get_tweets(num_tweets, output_fh):
    next_token = None
    tweets_stored = 0
    while tweets_stored < num_tweets:
        headers = create_headers(bearer_token)
        json_response = connect_to_endpoint(search_url, headers, query_params, next_token)
        if json_response['meta']['result_count'] == 0:
            break
        author_dict = {x['id']: x['username']
                       for x in json_response['includes']['users']}
        for tweet in json_response['data']:
            try:
                tweet['username'] = author_dict[tweet['author_id']]
                # 'username': author_info['username'],
                tweet['id'] = tweet['entities']
            except KeyError:
                print(f"No data for {tweet['author_id']}")
            output_fh.write(json.dumps(tweet) + '\n')
            tweets_stored += 1
        try:
            next_token = json_response['meta']['next_token']
        except KeyError:
            break
    return None
def main():
    with open(out_file, 'w') as f:
        get_tweets(1000, f)

main()
tweets = []
with open(out_file, 'r') as f:
    for row in f.readlines():
        tweet = json.loads(row)
        tweets.append(tweet)

tweets[0]
df4 = pd.DataFrame(tweets)
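A small follow-up sketch of my own (not part of the question): each saved line still contains nested objects such as public_metrics, so pandas.json_normalize can flatten them into separate columns instead of leaving dicts inside the DataFrame.

import json
import pandas as pd

with open('raw_tweets.txt', 'r') as f:
    tweets = [json.loads(row) for row in f]

# Nested keys become dotted column names, e.g. public_metrics.retweet_count
df_flat = pd.json_normalize(tweets)
print(df_flat.columns.tolist())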
Related
How to call multiple functions and execute SQL queries from random inputs from JSON?
I'm planning to call multiple functions and execute SQL queries from random inputs from JSON.

Actual code:

def daily():
    db_connection = DatabaseConnection()
    aaa(db_connection)
    bbb(db_connection)
    return {'success': True}

def aaa(db_connection):
    database_table = 'aaa'
    symbols_list = [{'code': 'XXX', 'database_column': 'aaa_1'},
                    {'code': 'YYY', 'database_column': 'aaa_2'}]
    load_data_from_api_to_database(db_connection, database_table, symbols_list)

def load_data_from_api_to_database(db_connection, database_table, symbols_list):
    http_request = HttpRequest()
    for _, symbol in enumerate(symbols_list):
        code = symbol['code']
        database_column = symbol['database_column']
        response = http_request.get(f'https://api.example.com/value/{code}',
                                    headers={'accept': 'application/json', 'appkey': api_key})
        json_data = json.loads(response.text)
        if response.status_code != 200:
            return
        data_points = json_data['dataPoint']
        for x, _ in enumerate(data_points):
            value = data_points[x]['value']
            date = data_points[x]['date']
            db_connection.execute(f'INSERT INTO "{database_table}" ("{database_date_column}") VALUES (%(date_time)s) ON CONFLICT ("{database_date_column}") DO NOTHING',
                                  {'date_time': date})
            db_connection.execute(f'UPDATE "{database_table}" SET "{database_column}" = %(value)s WHERE "{database_date_column}" = %(date_time)s',
                                  {'value': value, 'date_time': date})
            db_connection.commit()

bbb() is similar to aaa(), just with a different JSON array value.

Test code:

class TestDailyHandler(unittest.TestCase):
    @classmethod
    def setup_class(cls):
        cls.mock_get_patcher = patch('src.daily_handler.HttpRequest.get')
        cls.mock_get = cls.mock_get_patcher.start()
        cls.mock_database_connection_patcher = patch('src.daily_handler.DatabaseConnection')
        cls.mock_database_connection = cls.mock_database_connection_patcher.start()

    def load_data_from_api_to_database(self):
        assert load_data_from_api_to_database({}, None) == {'success': True}
        symbols_list = [{'code': 'XXX', 'database_column': 'aaa_1'},
                        {'code': 'YYY', 'database_column': 'aaa_2'}]
        for x in range(len(symbols_list)):
            code = [x]['code']
            self.mock_get.assert_any_call(f'https://api.example.com/value/symbols_list{code}',
                                          headers={'accept': 'application/json', 'appkey': self.mock_get_aws_secret.return_value})
        db_execute_many_args_list = self.mock_database_connection.return_value.execute_many.call_args_list
        daily_table_insert_command_length = len([x for x in db_execute_many_args_list if re.search(r'INSERT INTO ', str(x), re.IGNORECASE)])
        self.assertEqual(daily_table_insert_command_length, len(db_execute_many_args_list))
        self.assertEqual(self.mock_database_connection.return_value.commit.call_count, daily_table_insert_command_length)
        db_execute_many_args_list = self.mock_database_connection.return_value.execute_many.call_args_list
        daily_table_update_command_length = len([x for x in db_execute_many_args_list if re.search(r'UPDATE ', str(x), re.IGNORECASE)])
        self.assertEqual(daily_table_update_command_length, len(db_execute_many_args_list))
        self.assertEqual(self.mock_database_connection.return_value.commit.call_count, daily_table_update_command_length)

By the way, I'm not sure how to call the multiple functions aaa() and bbb(). I suppose I should test starting from daily() instead of from the load_data_from_api_to_database() function. Also, the JSON array input for each function is currently a static value.
Warning: the following code is provided as guidance only, adapted from the example in "I have a string whose content is a function name, how to refer to the corresponding function in Python?". Additional reading: "Store functions in list and call them later".

New answer:

def aaa(db_connection, data):
    # bind from data
    symbols_list = data
    # ...

def bbb(db_connection, data):
    # bind from data
    symbols_list = data
    # ...

dispatcher = {
    "aaa": aaa,
    "bbb": bbb
}

def daily():
    db_connection = DatabaseConnection()
    # loop the dict of functions
    for func_name in dispatcher:
        if callable(dispatcher[func_name]):
            json_str = input('Enter your JSON data:')
            try:
                data = json.loads(json_str)
                dispatcher[func_name](db_connection, data)
            except json.JSONDecodeError:
                print('Error loading json')
                return None
    return {'success': True}
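A short usage sketch of my own (the parse_and_dispatch name and the shape of the JSON payload are hypothetical, not from the question): the same dispatcher can be driven by a single JSON document that maps each function name to its input array, which also removes the static symbols_list values.

import json

def aaa(db_connection, data):
    print('aaa called with', data)

def bbb(db_connection, data):
    print('bbb called with', data)

dispatcher = {"aaa": aaa, "bbb": bbb}

def parse_and_dispatch(db_connection, json_str):
    # json_str maps function names to their symbol lists, e.g.
    # {"aaa": [{"code": "XXX", "database_column": "aaa_1"}], "bbb": [...]}
    try:
        payload = json.loads(json_str)
    except json.JSONDecodeError:
        print('Error loading json')
        return None
    for func_name, data in payload.items():
        fn = dispatcher.get(func_name)
        if callable(fn):
            fn(db_connection, data)
    return {'success': True}

# Example run with a stand-in connection object
print(parse_and_dispatch(None, '{"aaa": [{"code": "XXX", "database_column": "aaa_1"}]}'))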
I get TwitterConnectionError when I repeat requests
The following code, using the Twitter API, repeats getting tweets while it gets a 'next_token'. This results in an error, but I don't know what the error indicates. I would like to know how I should modify the code. Here's my code:

from TwitterAPI import TwitterAPI, TwitterPager
import csv
import pandas as pd
import time

# create a dataframe to store tweets
df = pd.DataFrame()
api = TwitterAPI(consumer_key, consumer_secret, auth_type='oAuth2', api_version='2')

# the first query without next_token
q = {'query': '(#abc -RT lang:en)',
     'start_time': "2017-10-16T00:00:00Z",
     'end_time': "2018-03-31T23:59:59Z",
     'tweet.fields': 'created_at',
     'max_results': '500'}
r = api.request('tweets/search/all', q)
response_data = r.json()
df_a = pd.DataFrame(response_data['data'])
# Add tweets to dataframe
df = pd.concat([df, df_a])

# Loop this process while there's a 'next_token'
while 'next_token' in response_data["meta"]:
    # sleep so as not to exceed the rate limit
    time.sleep(4)
    token = response_data["meta"]['next_token']
    # query with 'next_token'
    q = {'query': '(#abc -RT lang:en)',
         'start_time': "2017-10-16T00:00:00Z",
         'end_time': "2018-03-31T23:59:59Z",
         'tweet.fields': 'created_at',
         'max_results': '500',
         'next_token': token}
    r = api.request('tweets/search/all', q)
    if r.status_code == 200:
        response_data = r.json()
        df_a = pd.DataFrame(response_data['data'])
        df = pd.concat([df, df_a])
    else:
        print("ERROR: %d" % r.status_code)

I got an error as below:

TwitterConnectionError: HTTPSConnectionPool(host='api.twitter.com', port=443): Read timed out. (read timeout=5)
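No answer is included in this excerpt. As one hedged sketch of a workaround (my own, not from the thread): TwitterConnectionError appears to be what the TwitterAPI package raises for transient network problems such as this read timeout, so the request can be wrapped in a retry loop with a back-off:

from TwitterAPI import TwitterConnectionError
import time

def request_with_retry(api, endpoint, params, max_retries=5):
    # Retry the same request when the connection times out or drops.
    for attempt in range(max_retries):
        try:
            return api.request(endpoint, params)
        except TwitterConnectionError:
            wait = 2 ** attempt
            print(f'Connection error, retrying in {wait}s...')
            time.sleep(wait)
    raise RuntimeError('Giving up after repeated connection errors')

# Inside the pagination loop, replace the bare api.request call with:
# r = request_with_retry(api, 'tweets/search/all', q)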
Zendesk API search result index
I wrote a Python script to do GET and PUT requests against the Zendesk API. It successfully gets the data I want and does some updates to the tickets. The method below returns ticket number "6442", and the PUT request is intended to remove the tags.

from urllib.parse import urlencode
import json
import requests

# Set the credentials
credentials = 'some email', 'some password'
session = requests.Session()
session.auth = credentials

# Set the GET parameters
params_noreply_window = {
    'query': 'type:ticket tags:test status<closed',
}
params_oustide_businesshour = {
    'query': 'type:ticket tags:send_whatsapp_obh status:new',
}
url_search1 = 'https://propertypro.zendesk.com/api/v2/search.json?' + \
    urlencode(params_noreply_window)
url_search2 = 'https://propertypro.zendesk.com/api/v2/search.json?' + \
    urlencode(params_oustide_businesshour)
response_noreply_window = session.get(url_search1)
response_oustide_businesshour = session.get(url_search2)

# -----------------------------------------------------------------------------
if response_noreply_window.status_code != 200 | response_oustide_businesshour.status_code != 200:
    print('Status 1:', response_noreply_window.status_code + 'Status 2:',
          response_oustide_businesshour.status_code, 'Problem with the request. Exiting.')
    exit()

# Print the subject of each ticket in the results
data_noreply_window = response_noreply_window.json()
data_oustide_businesshour = response_oustide_businesshour.json()

# Tickets to update
# Create a list containing the values of the id field
# for each dictionary that is an element of the list data
id_merged1 = [result['id'] for result in data_noreply_window['results']]
print(type(id_merged1))
print(id_merged1)
id_merged2 = [result['id'] for result in data_oustide_businesshour['results']]
print(type(id_merged2))
print(id_merged2)

# Join the values of each list, comma separated
id_merged1_joined = ','.join(map(str, id_merged1))
print(id_merged1_joined)
id_merged2_joined = ','.join(map(str, id_merged2))
print(id_merged2_joined)

# Package the data in a dictionary matching the expected JSON
data_comment1 = {"ticket": {"remove_tags": ["test"]}}
data_comment2 = {"ticket": {"remove_tags": ["send_whatsapp_obh"]}}

# Encode the data to create a JSON payload
payload1 = json.dumps(data_comment1)
payload2 = json.dumps(data_comment2)
print("**Start**")

# Set the request parameters
url_put_comments1 = 'https://propertypro.zendesk.com/api/v2/tickets/update_many.json?' + \
    'ids=' + id_merged1_joined
url_put_comments2 = 'https://propertypro.zendesk.com/api/v2/tickets/update_many.json?' + \
    'ids=' + id_merged2_joined
user = 'some email'
pwd = 'some password'
headers = {'content-type': 'application/json'}

# Do the HTTP PUT requests
response_request_noreply = requests.put(url_put_comments1, data=payload1,
                                        auth=(user, pwd), headers=headers)
response_request_obh = requests.put(url_put_comments2, data=payload2,
                                    auth=(user, pwd), headers=headers)

# Check for HTTP codes other than 200
if response_request_noreply.status_code != 200 | response_request_obh.status_code != 200:
    print('Status 1:', response_request_noreply.status_code + 'Status 2:',
          response_request_obh.status_code, 'Problem with the request. Exiting.')
    exit()

# Report success
print('Successfully added comment to tickets')

However, after running my Python code and doing another GET request, the same ticket number still appears, and I have to wait a random amount of time before I get the result I expect, which is 'null', since I already updated the ticket with the PUT request.
Can anyone explain to me how the Zendesk API works here? My apologies for any unclear sentences in explaining my concern.
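No answer is included in this excerpt. One likely explanation to check (my assumption, not confirmed by the thread) is that the Search API reads from an index that is updated asynchronously, so freshly updated tickets can keep appearing in search results for a while even though the tickets themselves are already changed. A small sketch for verifying the PUT immediately is to read the ticket through the Show Ticket endpoint instead of searching again (the subdomain and ticket id 6442 are taken from the question; the credentials are placeholders):

import requests

credentials = ('some email', 'some password')   # placeholders, as in the question

def ticket_tags(ticket_id):
    # Read one ticket directly (not via search) and return its current tags.
    url = f'https://propertypro.zendesk.com/api/v2/tickets/{ticket_id}.json'
    response = requests.get(url, auth=credentials)
    response.raise_for_status()
    return response.json()['ticket']['tags']

# If the bulk update succeeded, the removed tag should already be absent here,
# even while the search endpoint still returns the stale result.
print(ticket_tags(6442))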
Why didn't raise_for_status() catch the error?
I am trying to check for a non-200 response in the current_track() function. What could be the problem? It throws a JSONDecodeError. But if I understood raise_for_status() correctly, it should have prevented the function from trying to load JSON from a faulty web page? If I run the script without this check and instead uncomment the lines in check_playback(), it successfully catches JSONDecodeError. The script fetches data from Spotify and puts it into the status on vk.com.

import config
import webbrowser
import requests
import furl
import secrets
import string
import time
import os
import simplejson as json

URL_CODE_BASE_VK = 'https://oauth.vk.com/authorize'
URL_CODE_BASE_SP = 'https://accounts.spotify.com/authorize'
URL_TOKEN_VK = 'https://oauth.vk.com/access_token'
URL_TOKEN_SP = 'https://accounts.spotify.com/api/token'
URL_TRACK = 'https://api.spotify.com/v1/me/player/currently-playing'
URL_STATUS = 'https://api.vk.com/method/status.set'
EXP_IN_TOKEN_SP = 3400
EXP_IN_TOKEN_VK = 86400
FILE_TOKEN_VK = 'vk_token.json'
FILE_TOKEN_SP = 'sp_token.json'

def get_auth_code_vk():
    url_code_params = {
        'client_id': config.CLIENT_ID_VK,
        'response_type': 'code',
        'redirect_uri': 'https://oauth.vk.com/blank.html',
        'v': 5.92,
        'scope': 'status',
        'state': gen_state(),
        'display': 'page'
    }
    code = url_open(URL_CODE_BASE_VK, url_code_params)
    return parse_code(code)

def get_auth_code_sp():
    url_code_params = {
        'client_id': config.CLIENT_ID_SP,
        'response_type': 'code',
        'redirect_uri': 'https://www.spotify.com/',
        'scope': 'user-read-currently-playing',
        'state': gen_state()
    }
    code = url_open(URL_CODE_BASE_SP, url_code_params)
    return parse_code(code)

def gen_state():
    symbols = string.ascii_lowercase + string.digits
    return ''.join(secrets.choice(symbols) for _ in range(12))

def url_open(url_base, url_params):
    url_code_full = furl.furl(url_base).add(url_params).url
    webbrowser.open_new_tab(url_code_full)
    input_url = input('Enter the whole URL, that you have been redirected on: ')
    return input_url

def parse_code(url):
    return (url.split("code=")[1]).split("&state=")[0]

def get_token_vk():
    data = {
        'grant_type': 'authorization_code',
        'code': get_auth_code_vk(),
        'redirect_uri': 'https://oauth.vk.com/blank.html',
        'client_id': 6782333,
        'client_secret': config.CLIENT_SECRET_VK
    }
    response = requests.post(url=URL_TOKEN_VK, data=data).json()
    write_file(FILE_TOKEN_VK, response)

def get_token_sp():
    data = {
        'grant_type': 'authorization_code',
        'code': get_auth_code_sp(),
        'redirect_uri': 'https://www.spotify.com/',
        'client_id': config.CLIENT_ID_SP,
        'client_secret': config.CLIENT_SECRET_SP
    }
    response = requests.post(url=URL_TOKEN_SP, data=data).json()
    write_file(FILE_TOKEN_SP, response)

def write_file(tkn_file, response):
    dict = {}
    dict['token'] = response["access_token"]
    dict['time'] = time.time()
    with open(tkn_file, 'w') as file:
        file.write(json.dumps(dict))

def load_file(tkn_file):
    with open(tkn_file) as file:
        data = json.load(file)
    return data

def set_status():
    params = {
        'v': 5.92,
        'access_token': load_file(FILE_TOKEN_VK)['token'],
        'text': current_track()
    }
    set_status = requests.get(url=URL_STATUS, params=params)

def track_data():
    tkn_file = load_file(FILE_TOKEN_SP)['token']
    headers = {
        'Accept': 'application/json',
        'Authorization': f'Bearer {tkn_file}'
    }
    return requests.get(url=URL_TRACK, headers=headers)

def current_track():
    response = track_data()
    print(response)
    try:
        response.raise_for_status()
    except requests.exceptions.HTTPError as e:
        return "Error: " + str(e)
    # data = track_data().json()
    data = response.json()
    artist = data['item']['artists'][0]['name']
    track = data['item']['name']
    return (f'{artist} - {track}')

def check_playback():
    set_status()
    print(current_track())
    # try:
    #     set_status()
    #     print(current_track())
    # except json.decoder.JSONDecodeError:
    #     print('Not playing')

def token_missing(file):
    return not os.path.isfile(file)

def token_expired(file, exp_in):
    return time.time() - load_file(file)['time'] > exp_in

def token_not_valid(file, exp_in):
    return token_missing(file) or token_expired(file, exp_in)

def run_script():
    if token_not_valid(FILE_TOKEN_VK, EXP_IN_TOKEN_VK):
        get_token_vk()
    if token_not_valid(FILE_TOKEN_SP, EXP_IN_TOKEN_SP):
        get_token_sp()
    check_playback()

if __name__ == "__main__":
    run_script()

Error screen
raise_for_status() will only raise an exception if the server reported an error to you (and even then, only if it actually followed the HTTP spec and returned an HTTP error code). There is no way for the library to know that the response is incorrect. Even if it were correctly formatted JSON, it couldn't know what schema you expect it to follow (which fields should be present, and what types those fields should have). Even if it knew the schema and had verified it, there would still be no way for it to know that the data is actually correct and not made up on the spot.
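To make that concrete, here is a minimal sketch of my own (httpbin.org is used purely as a stand-in endpoint): a 200 response whose body is not JSON sails through raise_for_status() and only fails when .json() tries to parse it.

import requests

# 200 OK, but the body is HTML rather than JSON
response = requests.get('https://httpbin.org/html')
response.raise_for_status()   # no exception: the status code signals success

try:
    data = response.json()    # this is where the parsing error occurs
except ValueError as e:       # JSONDecodeError is a ValueError subclass in every requests version
    print('Body was not JSON:', e)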
Getting an error while fetching full data with my Python function
I am trying to fetch product data from an API. By default this API returns 20 products, and in a single request it can return at most 500 products if we use the API's parameter Limit=500. So to fetch all products we need to use one more parameter along with Limit: Offset (the number of products to skip). I have written the following function to achieve this, but in the case of full data my function is not working well, and it gives me an error like "Login failed. Signature mismatching".

def get_data(userid, api_key, action, pagination=True):
    timeformat = datetime.datetime.now().replace(microsecond=0).isoformat() + '+08:00'
    endpoint = 'https://example.com'
    page_json = {}
    # set required parameters for this api
    parameters = {
        'UserID': userid,
        'Version': '1.0',
        'Action': action,
        'Format': 'JSON',
        'Timestamp': timeformat
    }
    if pagination:
        page = 0
        parameters['Limit'] = 500
        while True:
            parameters['Offset'] = 500 * page
            # set the required cryptographic signature
            concatenated = urllib.parse.urlencode(sorted(parameters.items()))
            parameters['Signature'] = HMAC(api_key, concatenated.encode('utf-8'), sha256).hexdigest()
            page += 1
            try:
                response = requests.get(endpoint, params=parameters)
                page_json = response.json()
            except requests.exceptions.ConnectionError:
                print("Connection refused!")
                sleep(5)
    else:
        try:
            concatenated = urllib.parse.urlencode(sorted(parameters.items()))
            # set the required cryptographic signature
            parameters['Signature'] = HMAC(api_key, concatenated.encode('utf-8'), sha256).hexdigest()
            response = requests.get(endpoint, params=parameters)
            page_json = response.json()
        except requests.exceptions.ConnectionError:
            print("Connection refused!")
            sleep(5)
    return page_json

It looks like I am not setting my Signature parameter correctly in the case of full data. I printed the value of concatenated and it looks like this:

page is 1
concatenated:: Action=GetProducts&Format=JSON&Limit=500&Offset=500&Signature=3d9cd320a4bf816aeea828b9392ed2d5a27cd584b3a337338909c0ab161a101e&Timestamp=2018-05-26T12%3A58%3A38%2B08%3A00&UserID=contact%40example.com.sg&Version=1.0
try:
{'ErrorResponse': {'Head': {'ErrorCode': '7', 'ErrorMessage': 'E7:Login failed. Signature mismatching', 'ErrorType': 'Sender', 'RequestAction': 'GetProducts', 'RequestId': '0bb606c015273197313552686ec46f'}}}

page is 2
concatenated:: Action=GetProducts&Format=JSON&Limit=500&Offset=1000&Signature=c1bda1a5ab21c4e4182cc82ca7ba87cb9fc6c5f24c36f9bb006f9da906cf7083&Timestamp=2018-05-26T12%3A58%3A38%2B08%3A00&UserID=contact%40example.com.sg&Version=1.0
try:
{'ErrorResponse': {'Head': {'ErrorCode': '7', 'ErrorMessage': 'E7:Login failed. Signature mismatching', 'ErrorType': 'Sender', 'RequestAction': 'GetProducts', 'RequestId': '0bb606c015273197321748243ec3a5'}}}

Can you please look at my function and help me find out what I have written wrong and what it should be like?
Try this please:

if pagination:
    page = 0
    parameters['Limit'] = 500
    while True:
        parameters['Offset'] = 500 * page
        # set the required cryptographic signature
        concatenated = urllib.parse.urlencode(sorted(parameters.items()))
        parameters['Signature'] = HMAC(api_key, concatenated.encode('utf-8'), sha256).hexdigest()
        page += 1
        try:
            response = requests.get(endpoint, params=parameters)
            page_json = response.json()
        except requests.exceptions.ConnectionError:
            print("Connection refused!")
            sleep(5)
        del parameters['Signature']
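As I read the original loop (the thread itself does not spell this out), the problem is that after the first iteration the parameters dict still carries the previous 'Signature' entry, so it gets folded into the string being signed on the next pass and the server's recomputed signature no longer matches; deleting the key at the end of each iteration, as above, avoids that. An equivalent sketch that never mutates the signed dict is to sign a copy each time (HMAC, sha256 and urllib.parse used exactly as in the question's code):

from hashlib import sha256
from hmac import HMAC
import urllib.parse

def signed_params(parameters, api_key):
    # Build the signature over the unsigned parameters and return a new dict,
    # leaving the input untouched for the next iteration.
    concatenated = urllib.parse.urlencode(sorted(parameters.items()))
    signature = HMAC(api_key, concatenated.encode('utf-8'), sha256).hexdigest()
    return {**parameters, 'Signature': signature}

# Inside the pagination loop:
# parameters['Offset'] = 500 * page
# response = requests.get(endpoint, params=signed_params(parameters, api_key))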