Converting a Python script into a function to iterate over each row - python

How can I convert the Python script below into a function that I can call for each row of a dataframe, keeping a few variables dynamic, such as screen_name and domain?
import datetime
import numpy as np
import pandas as pd
from slacker import Slacker

# extractor is the authenticated Tweepy API object (created elsewhere, e.g. by twitter_setup())
# We create a tweet list as follows:
tweets = extractor.user_timeline(screen_name="abhi98358", count=200)
data = pd.DataFrame(data=[tweet.text for tweet in tweets], columns=['Tweets'])
# We add relevant data:
data['ID'] = np.array([tweet.id for tweet in tweets])
data['Date'] = np.array([tweet.created_at for tweet in tweets])
data['text'] = np.array([tweet.text for tweet in tweets])
#data['Date'] = pd.to_datetime(data['Date'], unit='ms').dt.tz_localize('UTC').dt.tz_convert('US/Eastern')
created_time = datetime.datetime.utcnow() - datetime.timedelta(minutes=1)
data = data[(data['Date'] > created_time) &
            (data['Date'] < datetime.datetime.utcnow())]
my_list = ['Maintenance', 'Scheduled', 'downtime', 'Issue', 'Voice', 'Happy',
           'Problem', 'Outage', 'Service', 'Interruption', 'voice-comms', 'Downtime']
ndata = data[data['Tweets'].str.contains("|".join(my_list), regex=True)].reset_index(drop=True)
slack = Slacker('xoxb-34234-44232424-sdkjfksdfjksd')
#message = "test message"
slack.chat.post_message('#ops-twitter-alerts', 'domain :' + ' ' + ndata['Tweets'] + '<!channel|>')
My dataframe looks like this:
inp = [{'client': 'epic', 'domain':'fnwp','twittername':'FortniteGame'},{'client': 'epic', 'domain':'fnwp','twittername':'Rainbow6Game'},{'client': 'abhi', 'domain':'abhi','twittername':'abhi98358'}]
df = pd.DataFrame(inp)
I want to iterate over the rows one by one: scrape the data and send the Slack notification for the first row, then move on to the second row, and so on.
I have already gone through "How to iterate over rows in a DataFrame in Pandas?".

Here you go buddy :-
for index, row in df.iterrows():
    twt = row['twittername']
    domain = row['domain']
    print(twt)
    print(domain)
    extractor = twitter_setup()
    # We create a tweet list as follows:
    tweets = extractor.user_timeline(screen_name=twt, count=200)
    data = pd.DataFrame(data=[tweet.text for tweet in tweets], columns=['Tweets'])
    # We add relevant data:
    data['ID'] = np.array([tweet.id for tweet in tweets])
    data['Date'] = np.array([tweet.created_at for tweet in tweets])
    data['text'] = np.array([tweet.text for tweet in tweets])
    #data['Date'] = pd.to_datetime(data['Date'], unit='ms').dt.tz_localize('UTC').dt.tz_convert('US/Eastern')
    created_time = datetime.datetime.utcnow() - datetime.timedelta(minutes=160)
    data = data[(data['Date'] > created_time) & (data['Date'] < datetime.datetime.utcnow())]
    my_list = ['Maintenance', 'Scheduled', 'downtime', 'Issue', 'Voice', 'Happy', 'hound',
               'Problem', 'Outage', 'Service', 'Interruption', 'ready', 'voice-comms', 'Downtime', 'Patch']
    ndata = data[data['Tweets'].str.contains("|".join(my_list), regex=True)].reset_index(drop=True)
    print(ndata)
    if len(ndata['Tweets']) > 0:
        slack.chat.post_message('#ops-twitter-alerts', domain + ': ' + ndata['Tweets'] + '<!channel|>')
    else:
        print('hi')
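If you specifically want the per-row logic wrapped in a function, as the question asked, here is a minimal sketch along the same lines (notify_for_row is a hypothetical name; it assumes twitter_setup, slack, my_list and the imports above are already defined):
def notify_for_row(twt, domain, minutes=160):
    extractor = twitter_setup()
    tweets = extractor.user_timeline(screen_name=twt, count=200)
    data = pd.DataFrame(data=[tweet.text for tweet in tweets], columns=['Tweets'])
    data['Date'] = np.array([tweet.created_at for tweet in tweets])
    # keep only tweets from the last `minutes` minutes
    created_time = datetime.datetime.utcnow() - datetime.timedelta(minutes=minutes)
    data = data[(data['Date'] > created_time) & (data['Date'] < datetime.datetime.utcnow())]
    # keep only tweets matching the keyword list
    ndata = data[data['Tweets'].str.contains("|".join(my_list), regex=True)].reset_index(drop=True)
    if len(ndata) > 0:
        slack.chat.post_message('#ops-twitter-alerts', domain + ': ' + ndata['Tweets'] + '<!channel|>')

for index, row in df.iterrows():
    notify_for_row(row['twittername'], row['domain'])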

Related

Create merged df based on the url list [pandas]

I was able to extract the data from the url_query URL, but additionally I would like to get the data from the list of URLs (issue_list below) built from the query['ids'] column of the dataframe. Please see the current logic below:
url = 'https://instancename.some-platform.com/api/now/table/data?display_value=true&'
team = 'query=group_name=123456789'
url_query = url+team
The query dataframe:
                                ids
0  aaabbb1cccdddeee4ffggghhhhh5iijj
1  aa1bbb2cccdddeee5ffggghhhhh6iijj
issue_list = []
for issue in query['ids']:
    issue_list.append(f'https://instancename.some-platform.com/api/now/table/data?display_value=true&?display_value=true&query=group_name&sys_id={issue}')

response = requests.get(url_query, headers=headers, auth=auth, proxies=proxies)
data = response.json()

def api_response(k):
    dct = dict(
        event_id = k['number'],
        created_time = k['created'],
        status = k['status'],
        created_by = k['raised_by'],
        short_desc = k['short_description'],
        group = k['team']
    )
    return dct

raw_data = []
for p in data['result']:
    rec = api_response(p)
    raw_data.append(rec)

df = pd.DataFrame.from_records(raw_data)
df:
The url_query response extracts what I need, but the key point is that I would like to add the data from issue_list to the existing df. I don't know how to feed issue_list into the request. I tried response = requests.get(issue_list, headers=headers, auth=auth, proxies=proxies), but I got an invalid schema error (requests.get expects a single URL string, not a list).
You can create a list of DataFrames by requesting each per-issue URL q instead of url_query, and then join them together at the end with concat:
dfs = []
for issue in query['ids']:
    q = f'https://instancename.some-platform.com/api/now/table/data?display_value=true&?display_value=true&query=group_name&sys_id={issue}'
    response = requests.get(q, headers=headers, auth=auth, proxies=proxies)
    data = response.json()
    raw_data = [api_response(k) for k in data['result']]
    df = pd.DataFrame.from_records(raw_data)
    dfs.append(df)

df = pd.concat(dfs, ignore_index=True)
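If you also want to keep the records returned by the original url_query request in the same result, a minimal sketch (reusing the api_response helper from the question) is to build that DataFrame first and pass it to the same concat:
base_response = requests.get(url_query, headers=headers, auth=auth, proxies=proxies)
base_df = pd.DataFrame.from_records([api_response(k) for k in base_response.json()['result']])
df = pd.concat([base_df] + dfs, ignore_index=True)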

Function to extract tweets from a user's timeline keeps looping over the same tweets

When I run the function, it brings back around 200 tweets and then loops until it reaches the total number of tweets the user actually has, so it keeps repeating the same 200 tweets until it hits that number.
def extract_tweets(userid):
    tweets = api.user_timeline(screen_name = userid,
                               count = 200,
                               include_rts = True,
                               tweet_mode = 'extended')
    for info in tweets[:3]:
        print('ID: {}'.format(info.id))
        print(info.created_at)
        print(info.full_text)
        print('\n')
    all_tweets = []
    all_tweets.extend(tweets)
    oldest_id = tweets[-1].id
    tweet_num = api.get_user(userid).statuses_count
    while len(all_tweets) < tweet_num:
        tweets = api.user_timeline(screen_name = userid,
                                   count = 200,
                                   tweet_mode = 'extended')
        if len(tweets) == 1000:
            break
        oldest_id = tweets[-1].id
        all_tweets.extend(tweets)
    outtweets = [[tweet.id_str,
                  tweet.created_at,
                  tweet.favorite_count,
                  tweet.retweet_count,
                  tweet.full_text.encode('utf-8').decode('utf-8')]
                 for idx, tweet in enumerate(all_tweets)]
    df = DataFrame(outtweets, columns = ['id', 'created_at', 'favorite_count', 'retweet_count', 'text'])
    df.to_csv('%s_tweets.csv' % userid, index = False)
    df.head(3)
Your break condition (len(tweets) == 1000) will never be true, because the maximum number of tweets returned per call is 200. If you want to exit the loop when all the user's tweets have been collected, the break condition should be:
if len(tweets) == 0:
break
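Note also that the original loop computes oldest_id but never sends it back to the API, so every call returns the same first page. A minimal sketch of the paging loop with both changes applied (assuming Tweepy's api.user_timeline, which accepts a max_id parameter):
while len(all_tweets) < tweet_num:
    tweets = api.user_timeline(screen_name = userid,
                               count = 200,
                               tweet_mode = 'extended',
                               max_id = oldest_id - 1)  # only ask for tweets older than the last one seen
    if len(tweets) == 0:
        break
    oldest_id = tweets[-1].id
    all_tweets.extend(tweets)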

Extracting tweets using Python and Tweepy

I am trying to look at President Trump's tweets on immigration and do some sentiment analysis on it. My code is:
import pprint
import datetime
# startDate = datetime.datetime(2019, 4, 20, 0, 0, 0)
# endDate = datetime.datetime(2020, 4, 29, 0, 0, 0)
username = "realDonaldTrump"
page = 1
stop_loop = False
finalList1 = []
curs = tweepy.Cursor(api.user_timeline, id=username)
for item in curs.items():
    finalList1.append(item)
print(len(finalList1))
data = pd.DataFrame(data=[tweet.text for tweet in finalList1], columns=['Tweets'])
#Add Relevant data
data['len'] = np.array([len(tweet.text) for tweet in finalList1])
data['ID'] = np.array([tweet.id for tweet in finalList1])
data['Date'] = np.array([tweet.created_at for tweet in finalList1])
data['Source'] = np.array([tweet.source for tweet in finalList1])
data['Likes'] = np.array([tweet.favorite_count for tweet in finalList1])
data['RTs'] = np.array([tweet.retweet_count for tweet in finalList1])
#Sentiment Analysis
from textblob import TextBlob
import re
def clean_tweet(tweet):
    '''
    Utility function to clean the text in a tweet by removing
    links and special characters using regex.
    '''
    return ' '.join(re.sub("(#[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", tweet).split())

def analize_sentiment(tweet):
    '''
    Utility function to classify the polarity of a tweet
    using textblob.
    '''
    analysis = TextBlob(clean_tweet(tweet))
    if analysis.sentiment.polarity > 0:
        return 1
    elif analysis.sentiment.polarity == 0:
        return 0
    else:
        return -1
data['SA'] = np.array([ analize_sentiment(tweet) for tweet in data['Tweets'] ])
The code works perfectly fine. However, I have two questions:
How do I get access to tweets before these? It gives me 3,200 tweets; how do I get the ones before that?
How do I get Donald Trump's tweets that contain specific keywords like 'immigration', 'refugee', 'china', etc.?
I have been trying to figure out a way, but without success.
For searching for specific keywords you can use API.search (documented at http://docs.tweepy.org/en/latest/api.html), for example:
q = "immigration"
max_tweets = 1000  # how many results to collect; pick whatever limit you need
searched_tweets = [status for status in tweepy.Cursor(api.search, q=q).items(max_tweets)]
See also: Managing Tweepy API Search
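Another option for the keyword question is simply to filter the data DataFrame you already built above, since the tweet text is in the Tweets column (this only searches the tweets already downloaded, of course); a minimal sketch:
keywords = ['immigration', 'refugee', 'china']
mask = data['Tweets'].str.contains('|'.join(keywords), case=False, regex=True)
keyword_tweets = data[mask].reset_index(drop=True)
print(len(keyword_tweets))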

Add data to a pandas dataframe using a for loop when the dataframe already contains some data

I have stored streaming Twitter data using Tweepy and extracted name, lang, country, and text from it into one pandas dataframe.
Now I want to add a gender field to the same dataframe, which I am getting from a gender API via a GET request in a for loop.
How can I add that gender column to the same dataframe?
tweets_data contains all the data; I am using nameparser to find the first name.
tweets['text'] = map(lambda tweet: tweet['text'], tweets_data)
tweets['lang'] = map(lambda tweet: tweet['lang'], tweets_data)
tweets['country'] = map(lambda tweet: tweet['place']['country'] if tweet['place'] != None else None, tweets_data)
tweets['name'] = map(lambda tweet: tweet['user']['name'], tweets_data)
tweets1 = pd.DataFrame()
tweets1['name1'] = map(lambda tweet: tweet['user']['name'], tweets_data)
gender_data = []
for i, v in tweets.iterrows():
    try:
        name1 = v['name']
        name = HumanName(name1)
        PARAMS = {'name': name['first']}
        r = requests.get(url = URL, params = PARAMS)
        data = r.json()
        name = data['name']
        gender = data['gender']
        gender_data.append(gender)
        print(gender_data)
    except:
        continue
tweets1 = pd.DataFrame(gender_data, columns=['gender'])
tweets.merge(tweets1, how='left', left_on='name', right_on='name1')
Pandas allows you to just add the field.
Take the following frame:
my_frame = pd.DataFrame({'name': ['bob', 'jack']})
You can add the gender column like this:
my_frame['gender'] = [1,2]
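Applied to the question's loop, that means building a gender list that stays aligned with the rows (appending None when the lookup fails) and assigning it directly as a new column; a minimal sketch reusing the URL, HumanName and requests pieces from the question:
gender_data = []
for i, v in tweets.iterrows():
    try:
        first_name = HumanName(v['name']).first
        r = requests.get(url=URL, params={'name': first_name})
        gender_data.append(r.json().get('gender'))
    except Exception:
        gender_data.append(None)  # keep the list the same length as the dataframe
tweets['gender'] = gender_data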

Read data from OECD API into python (and pandas)

I'm trying to download data from the OECD API (https://data.oecd.org/api/sdmx-json-documentation/) into Python.
I managed to download data in SDMX-JSON format (and transform it to JSON) so far:
OECD_ROOT_URL = "http://stats.oecd.org/SDMX-JSON/data"

def make_OECD_request(dsname, dimensions, params = None, root_dir = OECD_ROOT_URL):
    """Make URL for the OECD API and return a response"""
    """4 dimensions: location, subject, measure, frequency"""
    if not params:
        params = {}
    dim_args = ['+'.join(d) for d in dimensions]
    dim_str = '.'.join(dim_args)
    url = root_dir + '/' + dsname + '/' + dim_str + '/all'
    print('Requesting URL ' + url)
    return rq.get(url = url, params = params)

response = make_OECD_request('MEI'
    , [['USA', 'CZE'], [], [], ['M']]
    , {'startTime': '2009-Q1', 'endTime': '2010-Q1'})

if (response.status_code == 200):
    json = response.json()
How can I transform the data set into a pandas DataFrame? I tried pandas.read_json() and the pandasdmx library, but I was not able to solve this.
The documentation the original question points to does not (yet?) mention that the API accepts the parameter contentType, which may be set to csv. That makes it trivial to use with Pandas.
import pandas as pd

def get_from_oecd(sdmx_query):
    return pd.read_csv(
        f"https://stats.oecd.org/SDMX-JSON/data/{sdmx_query}?contentType=csv"
    )

print(get_from_oecd("MEI_FIN/IRLT.AUS.M/OECD").head())
Update:
The function to automatically download the data from OECD API is now available in my Python library CIF (abbreviation for the Composite Indicators Framework, installable via pip):
from cif import cif
data, subjects, measures = cif.createDataFrameFromOECD(countries = ['USA'], dsname = 'MEI', frequency = 'M')
Original answer:
If you need your data as a Pandas DataFrame, it is IMHO better to send your request to OECD with the additional parameter 'dimensionAtObservation': 'AllDimensions', which results in a more comprehensive JSON file.
Use the following functions to download the data:
import requests as rq
import pandas as pd
import re

OECD_ROOT_URL = "http://stats.oecd.org/SDMX-JSON/data"

def make_OECD_request(dsname, dimensions, params = None, root_dir = OECD_ROOT_URL):
    # Make URL for the OECD API and return a response
    # 4 dimensions: location, subject, measure, frequency
    # OECD API: https://data.oecd.org/api/sdmx-json-documentation/#d.en.330346
    if not params:
        params = {}
    dim_args = ['+'.join(d) for d in dimensions]
    dim_str = '.'.join(dim_args)
    url = root_dir + '/' + dsname + '/' + dim_str + '/all'
    print('Requesting URL ' + url)
    return rq.get(url = url, params = params)

def create_DataFrame_from_OECD(country = 'CZE', subject = [], measure = [], frequency = 'M', startDate = None, endDate = None):
    # Request data from OECD API and return pandas DataFrame
    # country: country code (max 1)
    # subject: list of subjects, empty list for all
    # measure: list of measures, empty list for all
    # frequency: 'M' for monthly and 'Q' for quarterly time series
    # startDate: date in YYYY-MM (2000-01) or YYYY-QQ (2000-Q1) format, None for all observations
    # endDate: date in YYYY-MM (2000-01) or YYYY-QQ (2000-Q1) format, None for all observations

    # Data download
    response = make_OECD_request('MEI'
        , [[country], subject, measure, [frequency]]
        , {'startTime': startDate, 'endTime': endDate, 'dimensionAtObservation': 'AllDimensions'})

    # Data transformation
    if (response.status_code == 200):
        responseJson = response.json()
        obsList = responseJson.get('dataSets')[0].get('observations')
        if (len(obsList) > 0):
            print('Data downloaded from %s' % response.url)
            timeList = [item for item in responseJson.get('structure').get('dimensions').get('observation') if item['id'] == 'TIME_PERIOD'][0]['values']
            subjectList = [item for item in responseJson.get('structure').get('dimensions').get('observation') if item['id'] == 'SUBJECT'][0]['values']
            measureList = [item for item in responseJson.get('structure').get('dimensions').get('observation') if item['id'] == 'MEASURE'][0]['values']
            obs = pd.DataFrame(obsList).transpose()
            obs.rename(columns = {0: 'series'}, inplace = True)
            obs['id'] = obs.index
            obs = obs[['id', 'series']]
            obs['dimensions'] = obs.apply(lambda x: re.findall('\d+', x['id']), axis = 1)
            obs['subject'] = obs.apply(lambda x: subjectList[int(x['dimensions'][1])]['id'], axis = 1)
            obs['measure'] = obs.apply(lambda x: measureList[int(x['dimensions'][2])]['id'], axis = 1)
            obs['time'] = obs.apply(lambda x: timeList[int(x['dimensions'][4])]['id'], axis = 1)
            obs['names'] = obs['subject'] + '_' + obs['measure']
            data = obs.pivot_table(index = 'time', columns = ['names'], values = 'series')
            return(data)
        else:
            print('Error: No available records, please change parameters')
    else:
        print('Error: %s' % response.status_code)
You can create requests like these:
data = create_DataFrame_from_OECD(country = 'CZE', subject = ['LOCOPCNO'])
data = create_DataFrame_from_OECD(country = 'USA', frequency = 'Q', startDate = '2009-Q1', endDate = '2010-Q1')
data = create_DataFrame_from_OECD(country = 'USA', frequency = 'M', startDate = '2009-01', endDate = '2010-12')
data = create_DataFrame_from_OECD(country = 'USA', frequency = 'M', subject = ['B6DBSI01'])
data = create_DataFrame_from_OECD(country = 'USA', frequency = 'Q', subject = ['B6DBSI01'])
You can recover the data from the source using code like this.
from urllib.request import urlopen
import json
URL = 'http://stats.oecd.org/SDMX-JSON/data/MEI/USA+CZE...M/all'
response = urlopen(URL).read()
responseDict = json.loads(str(response)[2:-1])
print (responseDict.keys())
print (len(responseDict['dataSets']))
Here is the output from this code.
dict_keys(['header', 'structure', 'dataSets'])
1
If you are curious about the appearance of the [2:-1] (I would be): str() applied to the bytes returned by urlopen produces the bytes literal representation, with a leading b' and a trailing ', so those extra characters have to be sliced off (decoding with response.decode('utf-8') would avoid the slicing). json.loads is documented to require a string as input.
This is the code I used to get to this point.
>>> from urllib.request import urlopen
>>> import json
>>> URL = 'http://stats.oecd.org/SDMX-JSON/data/MEI/USA+CZE...M/all'
>>> response = urlopen(URL).read()
>>> len(response)
9886387
>>> response[:50]
b'{"header":{"id":"1975590b-346a-47ee-8d99-6562ccc11'
>>> str(response[:50])
'b\'{"header":{"id":"1975590b-346a-47ee-8d99-6562ccc11\''
>>> str(response[-50:])
'b\'"uri":"http://www.oecd.org/contact/","text":""}]}}\''
I understand that this is not a complete solution, as you must still crack into the dataSets structure to get the data into pandas. It's a list, but you could explore it starting with this sketch.
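As a starting point (mirroring the transformation in the earlier answer), the observations of the first data set can be loaded straight into a frame:
import pandas as pd
obs = responseDict['dataSets'][0]['observations']  # dict keyed by dimension-index strings such as '0:0:0:0:0'
obs_df = pd.DataFrame(obs).transpose()             # one row per observation key
print(obs_df.head())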
The latest release of pandasdmx (pandasdmx.readthedocs.io) fixes previous issues accessing OECD data in sdmx-json.
