Data mining inside Twitter but getting less data - Python
I would like to get tweets matching specific keywords, within a specific time window, and in a specific language. I have all the credentials from the Twitter Developer Portal (key, token, and bearer token). I tried one extraction using this code:
import tweepy
from twitter_authentication import bearer_token
import time
import pandas as pd

client = tweepy.Client(bearer_token, wait_on_rate_limit=True)

frana_tweets = []
for response in tweepy.Paginator(client.search_all_tweets,
                                 query='frana OR smottamento OR scivolamento OR crollo OR dissesto lang:it place_country:IT',
                                 user_fields=['username', 'public_metrics', 'description', 'location', 'created_at', 'entities', 'url', 'verified'],
                                 tweet_fields=['created_at', 'geo', 'entities', 'lang', 'non_public_metrics', 'public_metrics', 'source'],
                                 place_fields=['country', 'place_type'],
                                 expansions=['author_id', 'geo.place_id'],
                                 start_time='2019-11-01T00:00:00Z',
                                 end_time='2019-11-30T23:59:59Z',
                                 max_results=500):
    time.sleep(1)
    frana_tweets.append(response)

result = []
user_dict = {}
place_dict = {}
tweet_dict = {}

# Loop through each response object
for response in frana_tweets:
    # Take all of the users, and put them into a dictionary of dictionaries with the info we want to keep
    for user in response.includes['users']:
        user_dict[user.id] = {'name': user.name,
                              'username': user.username,
                              'created_at': user.created_at,
                              'description': user.description,
                              'entities': user.entities,
                              'location': user.location,
                              'pinned_tweet_id': user.pinned_tweet_id,
                              'protected': user.protected,
                              'followers_count': user.public_metrics['followers_count'],
                              'following_count': user.public_metrics['following_count'],
                              'tweet_count': user.public_metrics['tweet_count'],
                              'listed_count': user.public_metrics['listed_count'],
                              'url': user.url,
                              'verified': user.verified
                              }
    for place in response.includes['places']:
        place_dict[place.id] = {'geo_id': place.id,
                                'full_name': place.full_name,
                                'country': place.country,
                                'place_type': place.place_type
                                }
    for tweet in response.data:
        # For each tweet, find the author's information
        author_info = user_dict[tweet.author_id]
        geo_info = place_dict[place.id]
        # Put all of the information we want to keep in a single dictionary for each tweet
        result.append({'author_id': tweet.author_id,
                       'name': author_info['name'],
                       'username': author_info['username'],
                       'author_created_at': author_info['created_at'],
                       'author_description': author_info['description'],
                       'author_entities': author_info['entities'],
                       'author_location': author_info['location'],
                       'pinned_tweet_id': author_info['pinned_tweet_id'],
                       'protected': author_info['protected'],
                       'author_followers': author_info['followers_count'],
                       'author_following': author_info['following_count'],
                       'author_tweet_count': author_info['tweet_count'],
                       'author_listed_count': author_info['listed_count'],
                       'author_url': author_info['url'],
                       'author_verified': author_info['verified'],
                       'id_text': tweet.id,  # identifies the tweet
                       'text': tweet.text,
                       'created_at': tweet.created_at,
                       'lang': tweet.lang,
                       'geo': tweet.geo,
                       'entities': tweet.entities,
                       'retweets': tweet.public_metrics['retweet_count'],
                       'replies': tweet.public_metrics['reply_count'],
                       'likes': tweet.public_metrics['like_count'],
                       'quote_count': tweet.public_metrics['quote_count'],
                       'non_public_metrics': tweet.non_public_metrics,
                       # 'in_reply_to_user_id': tweet.in_reply_to_user_id,
                       'source': tweet.source,
                       'geo_id': geo_info['geo_id'],
                       'full_name': geo_info['full_name'],
                       'country': geo_info['country'],
                       'place_type': geo_info['place_type']
                       })

# Change this list of dictionaries into a dataframe
df4 = pd.DataFrame(result)
But it didn't get all the data from Twitter; some tweets that match the query were not extracted. Why?
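As a first diagnostic (a minimal sketch of my own, assuming the same query and the Academic Research access that search_all_tweets already requires), the full-archive counts endpoint reports how many tweets actually match the query in that window, which makes it possible to tell whether the paginator is dropping results or the query simply matches fewer tweets than expected:

import tweepy
from twitter_authentication import bearer_token

client = tweepy.Client(bearer_token, wait_on_rate_limit=True)

# Count matching tweets per day over the same window (no tweet objects are returned).
counts = client.get_all_tweets_count(
    query='frana OR smottamento OR scivolamento OR crollo OR dissesto lang:it place_country:IT',
    start_time='2019-11-01T00:00:00Z',
    end_time='2019-11-30T23:59:59Z',
    granularity='day',
)
print('Tweets matching the query:', sum(bucket['tweet_count'] for bucket in counts.data))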
I have tried another approach, but I have the same problem:
import requests
import os
import json
import twitter_authentication as config
import time
import pandas as pd
# Save your bearer token in a file called twitter_authentication.py in this directory.
# It should look like this:
# bearer_token = 'name_Bearer_token'
bearer_token = config.bearer_token
query = 'frana OR smottamento OR scivolamento OR crollo OR dissesto lang:it'
out_file = 'raw_tweets.txt'
search_url = "https://api.twitter.com/2/tweets/search/all"
# Optional params: start_time,end_time,since_id,until_id,max_results,next_token,
# expansions,tweet.fields,media.fields,poll.fields,place.fields,user.fields
query_params = {'query': query,
                'start_time': '2019-11-01T00:00:00Z',
                'end_time': '2019-11-30T23:59:59Z',
                'tweet.fields': 'id,text,author_id,in_reply_to_user_id,geo,conversation_id,created_at,lang,public_metrics,referenced_tweets,reply_settings,source',
                'user.fields': 'id,name,username,created_at,description,public_metrics,verified',
                'expansions': 'author_id,geo.place_id',
                'place.fields': 'full_name,id,country,country_code,geo,name,place_type',
                'max_results': 500
                }
def create_headers(bearer_token):
    headers = {"Authorization": "Bearer {}".format(bearer_token)}
    return headers
def connect_to_endpoint(url, headers, params, next_token=None):
    if next_token:
        params['next_token'] = next_token
    response = requests.request("GET", search_url, headers=headers, params=params)
    time.sleep(3.1)
    print(response.status_code)
    if response.status_code != 200:
        raise Exception(response.status_code, response.text)
    return response.json()
def get_tweets(num_tweets, output_fh):
    next_token = None
    tweets_stored = 0
    while tweets_stored < num_tweets:
        headers = create_headers(bearer_token)
        json_response = connect_to_endpoint(search_url, headers, query_params, next_token)
        if json_response['meta']['result_count'] == 0:
            break
        author_dict = {x['id']: x['username']
                       for x in json_response['includes']['users']}
        for tweet in json_response['data']:
            try:
                tweet['username'] = author_dict[tweet['author_id']]
                # 'username': author_info['username'],
                tweet['id'] = tweet['entities']
            except KeyError:
                print(f"No data for {tweet['author_id']}")
            output_fh.write(json.dumps(tweet) + '\n')
            tweets_stored += 1
        try:
            next_token = json_response['meta']['next_token']
        except KeyError:
            break
    return None
def main():
    with open(out_file, 'w') as f:
        get_tweets(1000, f)

main()
tweets = []
with open(out_file, 'r') as f:
    for row in f.readlines():
        tweet = json.loads(row)
        tweets.append(tweet)

tweets[0]
df4 = pd.DataFrame(tweets)
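A small follow-up sketch of my own (not part of the question): each saved line still contains nested objects such as public_metrics, so pandas.json_normalize can flatten them into separate columns instead of leaving dicts inside the DataFrame.

import json
import pandas as pd

with open('raw_tweets.txt', 'r') as f:
    tweets = [json.loads(row) for row in f]

# Nested keys become dotted column names, e.g. public_metrics.retweet_count
df_flat = pd.json_normalize(tweets)
print(df_flat.columns.tolist())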
Related
How to call multiple functions and execute SQL queries from random inputs from JSON?
I'm planning to call multiple functions and execute SQL queries from random inputs from JSON.

Actual code:

def daily():
    db_connection = DatabaseConnection()
    aaa(db_connection)
    bbb(db_connection)
    return {'success': True}

def aaa(db_connection):
    database_table = 'aaa'
    symbols_list = [{'code': 'XXX', 'database_column': 'aaa_1'},
                    {'code': 'YYY', 'database_column': 'aaa_2'}]
    load_data_from_api_to_database(db_connection, database_table, symbols_list)

def load_data_from_api_to_database(db_connection, database_table, symbols_list):
    http_request = HttpRequest()
    for _, symbol in enumerate(symbols_list):
        code = symbol['code']
        database_column = symbol['database_column']
        response = http_request.get(f'https://api.example.com/value/{code}',
                                    headers={'accept': 'application/json', 'appkey': api_key})
        json_data = json.loads(response.text)
        if response.status_code != 200:
            return
        data_points = json_data['dataPoint']
        for x, _ in enumerate(data_points):
            value = data_points[x]['value']
            date = data_points[x]['date']
            db_connection.execute(f'INSERT INTO "{database_table}" ("{database_date_column}") VALUES (%(date_time)s) ON CONFLICT ("{database_date_column}") DO NOTHING',
                                  {'date_time': date})
            db_connection.execute(f'UPDATE "{database_table}" SET "{database_column}" = %(value)s WHERE "{database_date_column}" = %(date_time)s',
                                  {'value': value, 'date_time': date})
            db_connection.commit()

bbb() is similar to aaa(), just with a different JSON array value.

Test code:

class TestDailyHandler(unittest.TestCase):
    @classmethod
    def setup_class(cls):
        cls.mock_get_patcher = patch('src.daily_handler.HttpRequest.get')
        cls.mock_get = cls.mock_get_patcher.start()
        cls.mock_database_connection_patcher = patch('src.daily_handler.DatabaseConnection')
        cls.mock_database_connection = cls.mock_database_connection_patcher.start()

    def load_data_from_api_to_database(self):
        assert load_data_from_api_to_database({}, None) == {'success': True}
        symbols_list = [{'code': 'XXX', 'database_column': 'aaa_1'},
                        {'code': 'YYY', 'database_column': 'aaa_2'}]
        for x in range(len(symbols_list)):
            code = [x]['code']
            self.mock_get.assert_any_call(f'https://api.example.com/value/symbols_list{code}',
                                          headers={'accept': 'application/json', 'appkey': self.mock_get_aws_secret.return_value})
        db_execute_many_args_list = self.mock_database_connection.return_value.execute_many.call_args_list
        daily_table_insert_command_length = len([x for x in db_execute_many_args_list if re.search(r'INSERT INTO ', str(x), re.IGNORECASE)])
        self.assertEqual(daily_table_insert_command_length, len(db_execute_many_args_list))
        self.assertEqual(self.mock_database_connection.return_value.commit.call_count, daily_table_insert_command_length)
        db_execute_many_args_list = self.mock_database_connection.return_value.execute_many.call_args_list
        daily_table_update_command_length = len([x for x in db_execute_many_args_list if re.search(r'UPDATE ', str(x), re.IGNORECASE)])
        self.assertEqual(daily_table_update_command_length, len(db_execute_many_args_list))
        self.assertEqual(self.mock_database_connection.return_value.commit.call_count, daily_table_update_command_length)

By the way, I'm not sure how to call the multiple functions aaa() and bbb(). I suppose I should test starting from daily() instead of from the load_data_from_api_to_database() function. Also, the JSON array input for each function is currently a static value.
Warning: the following code is provided as guidance only, adapted from the example in "I have a string whose content is a function name, how to refer to the corresponding function in Python?". Additional reading: "Store functions in list and call them later".

New answer:

def aaa(db_connection, data):
    # bind from data
    symbols_list = data
    # ...

def bbb(db_connection, data):
    # bind from data
    symbols_list = data
    # ...

dispatcher = {
    "aaa": aaa,
    "bbb": bbb
}

def daily():
    db_connection = DatabaseConnection()
    # loop the dict of functions
    for func_name in dispatcher:
        if callable(dispatcher[func_name]):
            json_str = input('Enter your JSON data:')
            try:
                data = json.loads(json_str)
                dispatcher[func_name](db_connection, data)
            except json.JSONDecodeError:
                print('Error loading json')
                return None
    return {'success': True}
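A short usage sketch of my own (the parse_and_dispatch name and the shape of the JSON payload are hypothetical, not from the question): the same dispatcher can be driven by a single JSON document that maps each function name to its input array, which also removes the static symbols_list values.

import json

def aaa(db_connection, data):
    print('aaa called with', data)

def bbb(db_connection, data):
    print('bbb called with', data)

dispatcher = {"aaa": aaa, "bbb": bbb}

def parse_and_dispatch(db_connection, json_str):
    # json_str maps function names to their symbol lists, e.g.
    # {"aaa": [{"code": "XXX", "database_column": "aaa_1"}], "bbb": [...]}
    try:
        payload = json.loads(json_str)
    except json.JSONDecodeError:
        print('Error loading json')
        return None
    for func_name, data in payload.items():
        fn = dispatcher.get(func_name)
        if callable(fn):
            fn(db_connection, data)
    return {'success': True}

# Example run with a stand-in connection object
print(parse_and_dispatch(None, '{"aaa": [{"code": "XXX", "database_column": "aaa_1"}]}'))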
I get TwitterConnectionError when I repeat requests
The following code, using the Twitter API, repeats getting tweets while it gets a 'next_token'. This results in an error, but I don't know what the error indicates. I would like to know how I should modify the code. Here's my code:

from TwitterAPI import TwitterAPI, TwitterPager
import csv
import pandas as pd
import time

# create a dataframe to store tweets
df = pd.DataFrame()
api = TwitterAPI(consumer_key, consumer_secret, auth_type='oAuth2', api_version='2')

# the first query without next_token
q = {'query': '(#abc -RT lang:en)',
     'start_time': "2017-10-16T00:00:00Z",
     'end_time': "2018-03-31T23:59:59Z",
     'tweet.fields': 'created_at',
     'max_results': '500'}
r = api.request('tweets/search/all', q)
response_data = r.json()
df_a = pd.DataFrame(response_data['data'])
# Add tweets to dataframe
df = pd.concat([df, df_a])

# Loop this process while there's a 'next_token'
while 'next_token' in response_data["meta"]:
    # sleep so as not to exceed the rate limit
    time.sleep(4)
    token = response_data["meta"]['next_token']
    # query with 'next_token'
    q = {'query': '(#abc -RT lang:en)',
         'start_time': "2017-10-16T00:00:00Z",
         'end_time': "2018-03-31T23:59:59Z",
         'tweet.fields': 'created_at',
         'max_results': '500',
         'next_token': token}
    r = api.request('tweets/search/all', q)
    if r.status_code == 200:
        response_data = r.json()
        df_a = pd.DataFrame(response_data['data'])
        df = pd.concat([df, df_a])
    else:
        print("ERROR: %d" % r.status_code)

I got an error as below:

TwitterConnectionError: HTTPSConnectionPool(host='api.twitter.com', port=443): Read timed out. (read timeout=5)
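No answer is included in this excerpt. As one hedged sketch of a workaround (my own, not from the thread): TwitterConnectionError appears to be what the TwitterAPI package raises for transient network problems such as this read timeout, so the request can be wrapped in a retry loop with a back-off:

from TwitterAPI import TwitterConnectionError
import time

def request_with_retry(api, endpoint, params, max_retries=5):
    # Retry the same request when the connection times out or drops.
    for attempt in range(max_retries):
        try:
            return api.request(endpoint, params)
        except TwitterConnectionError:
            wait = 2 ** attempt
            print(f'Connection error, retrying in {wait}s...')
            time.sleep(wait)
    raise RuntimeError('Giving up after repeated connection errors')

# Inside the pagination loop, replace the bare api.request call with:
# r = request_with_retry(api, 'tweets/search/all', q)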
Zendesk API search result index
I wrote a Python script to do GET and PUT requests against the Zendesk API. It successfully gets the data I want and does some updates to the tickets. The method below returns ticket number "6442", and the PUT request is intended to remove the tags.

from urllib.parse import urlencode
import json
import requests

# Set the credentials
credentials = 'some email', 'some password'
session = requests.Session()
session.auth = credentials

# Set the GET parameters
params_noreply_window = {
    'query': 'type:ticket tags:test status<closed',
}
params_oustide_businesshour = {
    'query': 'type:ticket tags:send_whatsapp_obh status:new',
}
url_search1 = 'https://propertypro.zendesk.com/api/v2/search.json?' + \
    urlencode(params_noreply_window)
url_search2 = 'https://propertypro.zendesk.com/api/v2/search.json?' + \
    urlencode(params_oustide_businesshour)
response_noreply_window = session.get(url_search1)
response_oustide_businesshour = session.get(url_search2)

# -----------------------------------------------------------------------------
if response_noreply_window.status_code != 200 | response_oustide_businesshour.status_code != 200:
    print('Status 1:', response_noreply_window.status_code + 'Status 2:',
          response_oustide_businesshour.status_code, 'Problem with the request. Exiting.')
    exit()

# Print the subject of each ticket in the results
data_noreply_window = response_noreply_window.json()
data_oustide_businesshour = response_oustide_businesshour.json()

# Tickets to update
# Create a list containing the values of the id field
# for each dictionary that is an element of the list data
id_merged1 = [result['id'] for result in data_noreply_window['results']]
print(type(id_merged1))
print(id_merged1)
id_merged2 = [result['id'] for result in data_oustide_businesshour['results']]
print(type(id_merged2))
print(id_merged2)

# Join the values of each list, comma separated
id_merged1_joined = ','.join(map(str, id_merged1))
print(id_merged1_joined)
id_merged2_joined = ','.join(map(str, id_merged2))
print(id_merged2_joined)

# Package the data in a dictionary matching the expected JSON
data_comment1 = {"ticket": {"remove_tags": ["test"]}}
data_comment2 = {"ticket": {"remove_tags": ["send_whatsapp_obh"]}}

# Encode the data to create a JSON payload
payload1 = json.dumps(data_comment1)
payload2 = json.dumps(data_comment2)
print("**Start**")

# Set the request parameters
url_put_comments1 = 'https://propertypro.zendesk.com/api/v2/tickets/update_many.json?' + \
    'ids=' + id_merged1_joined
url_put_comments2 = 'https://propertypro.zendesk.com/api/v2/tickets/update_many.json?' + \
    'ids=' + id_merged2_joined
user = 'some email'
pwd = 'some password'
headers = {'content-type': 'application/json'}

# Do the HTTP PUT requests
response_request_noreply = requests.put(url_put_comments1, data=payload1,
                                        auth=(user, pwd), headers=headers)
response_request_obh = requests.put(url_put_comments2, data=payload2,
                                    auth=(user, pwd), headers=headers)

# Check for HTTP codes other than 200
if response_request_noreply.status_code != 200 | response_request_obh.status_code != 200:
    print('Status 1:', response_request_noreply.status_code + 'Status 2:',
          response_request_obh.status_code, 'Problem with the request. Exiting.')
    exit()

# Report success
print('Successfully added comment to tickets')

However, after running my Python code and doing another GET request, the same ticket number still appears, and I have to wait a random amount of time before I get the result I expect, which is 'null', since I already updated the ticket with the PUT request.
Can anyone explain to me how the Zendesk API works here? My apologies for any unclear sentences in explaining my concern.
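No answer is included in this excerpt. One likely explanation to check (my assumption, not confirmed by the thread) is that the Search API reads from an index that is updated asynchronously, so freshly updated tickets can keep appearing in search results for a while even though the tickets themselves are already changed. A small sketch for verifying the PUT immediately is to read the ticket through the Show Ticket endpoint instead of searching again (the subdomain and ticket id 6442 are taken from the question; the credentials are placeholders):

import requests

credentials = ('some email', 'some password')   # placeholders, as in the question

def ticket_tags(ticket_id):
    # Read one ticket directly (not via search) and return its current tags.
    url = f'https://propertypro.zendesk.com/api/v2/tickets/{ticket_id}.json'
    response = requests.get(url, auth=credentials)
    response.raise_for_status()
    return response.json()['ticket']['tags']

# If the bulk update succeeded, the removed tag should already be absent here,
# even while the search endpoint still returns the stale result.
print(ticket_tags(6442))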
Why didn't raise_for_status() catch the error?
I am trying to check for a non-200 response in the current_track() function. What could be the problem? It throws a JSONDecodeError. But if I understood raise_for_status() correctly, it should have prevented the function from trying to load JSON from a faulty web page? If I run the script without this check and instead uncomment the lines in check_playback(), it successfully catches JSONDecodeError. The script fetches data from Spotify and puts it into the status on vk.com.

import config
import webbrowser
import requests
import furl
import secrets
import string
import time
import os
import simplejson as json

URL_CODE_BASE_VK = 'https://oauth.vk.com/authorize'
URL_CODE_BASE_SP = 'https://accounts.spotify.com/authorize'
URL_TOKEN_VK = 'https://oauth.vk.com/access_token'
URL_TOKEN_SP = 'https://accounts.spotify.com/api/token'
URL_TRACK = 'https://api.spotify.com/v1/me/player/currently-playing'
URL_STATUS = 'https://api.vk.com/method/status.set'
EXP_IN_TOKEN_SP = 3400
EXP_IN_TOKEN_VK = 86400
FILE_TOKEN_VK = 'vk_token.json'
FILE_TOKEN_SP = 'sp_token.json'

def get_auth_code_vk():
    url_code_params = {
        'client_id': config.CLIENT_ID_VK,
        'response_type': 'code',
        'redirect_uri': 'https://oauth.vk.com/blank.html',
        'v': 5.92,
        'scope': 'status',
        'state': gen_state(),
        'display': 'page'
    }
    code = url_open(URL_CODE_BASE_VK, url_code_params)
    return parse_code(code)

def get_auth_code_sp():
    url_code_params = {
        'client_id': config.CLIENT_ID_SP,
        'response_type': 'code',
        'redirect_uri': 'https://www.spotify.com/',
        'scope': 'user-read-currently-playing',
        'state': gen_state()
    }
    code = url_open(URL_CODE_BASE_SP, url_code_params)
    return parse_code(code)

def gen_state():
    symbols = string.ascii_lowercase + string.digits
    return ''.join(secrets.choice(symbols) for _ in range(12))

def url_open(url_base, url_params):
    url_code_full = furl.furl(url_base).add(url_params).url
    webbrowser.open_new_tab(url_code_full)
    input_url = input('Enter the whole URL, that you have been redirected on: ')
    return input_url

def parse_code(url):
    return (url.split("code=")[1]).split("&state=")[0]

def get_token_vk():
    data = {
        'grant_type': 'authorization_code',
        'code': get_auth_code_vk(),
        'redirect_uri': 'https://oauth.vk.com/blank.html',
        'client_id': 6782333,
        'client_secret': config.CLIENT_SECRET_VK
    }
    response = requests.post(url=URL_TOKEN_VK, data=data).json()
    write_file(FILE_TOKEN_VK, response)

def get_token_sp():
    data = {
        'grant_type': 'authorization_code',
        'code': get_auth_code_sp(),
        'redirect_uri': 'https://www.spotify.com/',
        'client_id': config.CLIENT_ID_SP,
        'client_secret': config.CLIENT_SECRET_SP
    }
    response = requests.post(url=URL_TOKEN_SP, data=data).json()
    write_file(FILE_TOKEN_SP, response)

def write_file(tkn_file, response):
    dict = {}
    dict['token'] = response["access_token"]
    dict['time'] = time.time()
    with open(tkn_file, 'w') as file:
        file.write(json.dumps(dict))

def load_file(tkn_file):
    with open(tkn_file) as file:
        data = json.load(file)
    return data

def set_status():
    params = {
        'v': 5.92,
        'access_token': load_file(FILE_TOKEN_VK)['token'],
        'text': current_track()
    }
    set_status = requests.get(url=URL_STATUS, params=params)

def track_data():
    tkn_file = load_file(FILE_TOKEN_SP)['token']
    headers = {
        'Accept': 'application/json',
        'Authorization': f'Bearer {tkn_file}'
    }
    return requests.get(url=URL_TRACK, headers=headers)

def current_track():
    response = track_data()
    print(response)
    try:
        response.raise_for_status()
    except requests.exceptions.HTTPError as e:
        return "Error: " + str(e)
    # data = track_data().json()
    data = response.json()
    artist = data['item']['artists'][0]['name']
    track = data['item']['name']
    return (f'{artist} - {track}')

def check_playback():
    set_status()
    print(current_track())
    # try:
    #     set_status()
    #     print(current_track())
    # except json.decoder.JSONDecodeError:
    #     print('Not playing')

def token_missing(file):
    return not os.path.isfile(file)

def token_expired(file, exp_in):
    return time.time() - load_file(file)['time'] > exp_in

def token_not_valid(file, exp_in):
    return token_missing(file) or token_expired(file, exp_in)

def run_script():
    if token_not_valid(FILE_TOKEN_VK, EXP_IN_TOKEN_VK):
        get_token_vk()
    if token_not_valid(FILE_TOKEN_SP, EXP_IN_TOKEN_SP):
        get_token_sp()
    check_playback()

if __name__ == "__main__":
    run_script()

Error screen
raise_for_status() will only raise an exception if the server reported an error to you (and even then, only if it actually followed the HTTP spec and returned an HTTP error code). There is no way for the library to know that the response is incorrect. Even if it were correctly formatted JSON, it couldn't know what schema you expect it to follow (which fields should be present, and what types those fields should have). Even if it knew the schema and had verified it, there would still be no way for it to know that the data is actually correct and not made up on the spot.
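To make that concrete, here is a minimal sketch of my own (httpbin.org is used purely as a stand-in endpoint): a 200 response whose body is not JSON sails through raise_for_status() and only fails when .json() tries to parse it.

import requests

# 200 OK, but the body is HTML rather than JSON
response = requests.get('https://httpbin.org/html')
response.raise_for_status()   # no exception: the status code signals success

try:
    data = response.json()    # this is where the parsing error occurs
except ValueError as e:       # JSONDecodeError is a ValueError subclass in every requests version
    print('Body was not JSON:', e)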
Getting an error while fetching full data with my Python function
I am trying to fetch product data from an API. By default this API returns 20 products, and in a single request it can return at most 500 products if we use the API's parameter Limit=500. So to fetch all products we need to use one more parameter along with Limit: Offset (the number of products to skip). I have written the following function to achieve this, but in the case of full data my function is not working well, and it gives me an error like "Login failed. Signature mismatching".

def get_data(userid, api_key, action, pagination=True):
    timeformat = datetime.datetime.now().replace(microsecond=0).isoformat() + '+08:00'
    endpoint = 'https://example.com'
    page_json = {}
    # set required parameters for this api
    parameters = {
        'UserID': userid,
        'Version': '1.0',
        'Action': action,
        'Format': 'JSON',
        'Timestamp': timeformat
    }
    if pagination:
        page = 0
        parameters['Limit'] = 500
        while True:
            parameters['Offset'] = 500 * page
            # set the required cryptographic signature
            concatenated = urllib.parse.urlencode(sorted(parameters.items()))
            parameters['Signature'] = HMAC(api_key, concatenated.encode('utf-8'), sha256).hexdigest()
            page += 1
            try:
                response = requests.get(endpoint, params=parameters)
                page_json = response.json()
            except requests.exceptions.ConnectionError:
                print("Connection refused!")
                sleep(5)
    else:
        try:
            concatenated = urllib.parse.urlencode(sorted(parameters.items()))
            # set the required cryptographic signature
            parameters['Signature'] = HMAC(api_key, concatenated.encode('utf-8'), sha256).hexdigest()
            response = requests.get(endpoint, params=parameters)
            page_json = response.json()
        except requests.exceptions.ConnectionError:
            print("Connection refused!")
            sleep(5)
    return page_json

It looks like I am not setting my Signature parameter correctly in the case of full data. I printed the value of concatenated and it looks like this:

page is 1
concatenated:: Action=GetProducts&Format=JSON&Limit=500&Offset=500&Signature=3d9cd320a4bf816aeea828b9392ed2d5a27cd584b3a337338909c0ab161a101e&Timestamp=2018-05-26T12%3A58%3A38%2B08%3A00&UserID=contact%40example.com.sg&Version=1.0
try:
{'ErrorResponse': {'Head': {'ErrorCode': '7', 'ErrorMessage': 'E7:Login failed. Signature mismatching', 'ErrorType': 'Sender', 'RequestAction': 'GetProducts', 'RequestId': '0bb606c015273197313552686ec46f'}}}

page is 2
concatenated:: Action=GetProducts&Format=JSON&Limit=500&Offset=1000&Signature=c1bda1a5ab21c4e4182cc82ca7ba87cb9fc6c5f24c36f9bb006f9da906cf7083&Timestamp=2018-05-26T12%3A58%3A38%2B08%3A00&UserID=contact%40example.com.sg&Version=1.0
try:
{'ErrorResponse': {'Head': {'ErrorCode': '7', 'ErrorMessage': 'E7:Login failed. Signature mismatching', 'ErrorType': 'Sender', 'RequestAction': 'GetProducts', 'RequestId': '0bb606c015273197321748243ec3a5'}}}

Can you please look at my function and help me find out what I have written wrong and what it should be like?
Try this please:

if pagination:
    page = 0
    parameters['Limit'] = 500
    while True:
        parameters['Offset'] = 500 * page
        # set the required cryptographic signature
        concatenated = urllib.parse.urlencode(sorted(parameters.items()))
        parameters['Signature'] = HMAC(api_key, concatenated.encode('utf-8'), sha256).hexdigest()
        page += 1
        try:
            response = requests.get(endpoint, params=parameters)
            page_json = response.json()
        except requests.exceptions.ConnectionError:
            print("Connection refused!")
            sleep(5)
        del parameters['Signature']
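As I read the original loop (the thread itself does not spell this out), the problem is that after the first iteration the parameters dict still carries the previous 'Signature' entry, so it gets folded into the string being signed on the next pass and the server's recomputed signature no longer matches; deleting the key at the end of each iteration, as above, avoids that. An equivalent sketch that never mutates the signed dict is to sign a copy each time (HMAC, sha256 and urllib.parse used exactly as in the question's code):

from hashlib import sha256
from hmac import HMAC
import urllib.parse

def signed_params(parameters, api_key):
    # Build the signature over the unsigned parameters and return a new dict,
    # leaving the input untouched for the next iteration.
    concatenated = urllib.parse.urlencode(sorted(parameters.items()))
    signature = HMAC(api_key, concatenated.encode('utf-8'), sha256).hexdigest()
    return {**parameters, 'Signature': signature}

# Inside the pagination loop:
# parameters['Offset'] = 500 * page
# response = requests.get(endpoint, params=signed_params(parameters, api_key))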