How to update search parameters within a for loop - python

How can I alter the if tweets_per_day == 100 statement so that, when the for loop starts its next iteration, the day variable is equal to the prior day? Here is what I want to happen:
On the first iteration of the for loop, 100 tweets created within the specified parameters are appended to the all_tweets list. (Currently working properly.)
Once 100 tweets have been appended, the day count is incremented by 1. (Currently working properly.)
Inside that if statement, once the 100 tweets for the day have been appended, the search parameters should be updated so that only tweets created one day earlier than the previous day are returned. (Not working.)
Instead, the for loop restarts with the previous parameters still in effect.
from datetime import datetime, timedelta
import pandas as pd
import sys
import numpy as np
from datetime import datetime, timedelta, time
import tweepy
from workalendar.usa import NewYork
import re
import pytz
import configparser

# Read configs
config = configparser.ConfigParser()
config.read('config.ini')

# Twitter API credentials
api_key = config['twitter']['api_key']
api_key_secret = config['twitter']['api_key_secret']
access_token = config['twitter']['access_token']
access_token_secret = config['twitter']['access_token_secret']

# Authentication
auth = tweepy.OAuthHandler(api_key, api_key_secret)
auth.set_access_token(access_token, access_token_secret)

# Make an API object
api = tweepy.API(auth)

stock = input('Enter a stock symbol: ').upper()
stock = '$' + stock  # Convert user input to a Twitter cashtag
print(stock)

start_date = datetime.now().date()
search_parameters = {
    'q': stock,
    'lang': 'en',
    'count': 100,
    'until': start_date,
    'tweet_mode': 'ext'
}
cursor = tweepy.Cursor(api.search_tweets, **search_parameters)

# Create a list of market holidays, where the stock exchange is closed
cal = NewYork()
cal.holidays(2023)

# Create a dataframe of stock market holidays
holiday_df = pd.DataFrame(cal.holidays(2023), columns=['Date', 'Holiday'])
holiday_df = holiday_df.drop([0, 3, 4, 9, 10, 11, 12])  # Remove non-market holidays

# Add additional market holidays
holiday_df.loc[9] = ['2023-04-07', 'Good Friday']
holiday_df.loc[10] = ['2023-06-19', 'Juneteenth']
holiday_df['Date'] = pd.to_datetime(holiday_df['Date'])

tweets_per_day = 0
day_count = 0
dates = []       # empty list to store the dates of appended tweets
all_tweets = []  # empty list to store all tweets being used
tweet_iterator = cursor.items(limit=1000)

for tweet in cursor.items(limit=10000):
    day = tweet.created_at
    print(f'Start of for loop day value: {day}')
    tweet_date = pd.to_datetime(day.date().strftime('%Y-%m-%d'))
    # Only add tweets which were made before market open on weekdays
    if day.time() < time(hour=9, minute=30) and day.weekday() < 5:
        if not holiday_df['Date'].eq(tweet_date).any():
            print(f'Tweet date {tweet.created_at.date()}, day date {day.date()}')
            while tweets_per_day < 100:
                if tweet.created_at.date() != day.date():  # If the day changes before 100 tweets, update day count
                    day_count += 1
                    tweets_per_day = 0
                    print('day != block')
                    break
                all_tweets.append(tweet.text)
                tweets_per_day += 1
                dates.append(tweet.created_at.date())
                print(f'Appended Tweet date: {tweet.created_at}')
                print(f'Tweets per day: {tweets_per_day}')
                tweet = next(tweet_iterator)
            if tweets_per_day == 100:
                day_count += 1
                search_parameters['until'] -= timedelta(days=1)
                cursor = tweepy.Cursor(api.search_tweets, **search_parameters)  # create a new Cursor with the updated search parameters
                tweet_iterator = cursor.items(limit=1000)  # update the tweet iterator with the new Cursor
                tweets_per_day = 0
    if day_count >= 5:
        print(f'Day count: {day_count}')
        break
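One hedged sketch of a possible restructuring, assuming the same api, search_parameters, holiday_df and imports as above (and bearing in mind that the standard search endpoint only looks back roughly a week): drive the loop from the tweet_iterator itself rather than from the original cursor, so that rebuilding the Cursor after 100 tweets actually changes what the loop consumes on its next step.

# Hedged sketch, not a drop-in replacement: iterate over a replaceable iterator
# so that updating search_parameters and rebuilding it takes effect immediately.
tweets_per_day = 0
day_count = 0
all_tweets = []
tweet_iterator = tweepy.Cursor(api.search_tweets, **search_parameters).items(limit=1000)

while day_count < 5:
    try:
        tweet = next(tweet_iterator)
    except StopIteration:
        break  # no more results for the current parameters
    day = tweet.created_at
    tweet_date = pd.to_datetime(day.date())
    # Same filters as above: pre-market, weekday, not a market holiday
    if day.time() >= time(hour=9, minute=30) or day.weekday() >= 5:
        continue
    if holiday_df['Date'].eq(tweet_date).any():
        continue
    all_tweets.append(tweet.text)
    tweets_per_day += 1
    if tweets_per_day == 100:
        day_count += 1
        tweets_per_day = 0
        # Move 'until' back one day and rebuild the iterator, so the very next
        # next() call uses the updated search parameters.
        search_parameters['until'] -= timedelta(days=1)
        tweet_iterator = tweepy.Cursor(api.search_tweets, **search_parameters).items(limit=1000)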

Related

xml with pandas: sum values by condition

recent_cases is supposed to sum the new COVID cases from the last 10 days for a given location.
Somehow my code prints None, and I can't find the problem.
import json
import pandas as pd
import plotly.express as ex
from datetime import *
from datetime import timedelta


class Covid:
    dt = timedelta(days=1)
    ten_days = timedelta(days=10)
    covid_data = pd.read_excel("owid-covid-data.xlsx", usecols="C:F,H,I")

    def recent_cases(self, cntry):
        today = datetime.today()
        temp = today - self.ten_days  # 10 days before today
        sum_of_cases = 0
        for ind in self.covid_data.index:
            if temp <= today:
                if (self.covid_data["date"][ind] == temp) and (self.covid_data['location'][ind] == cntry):
                    # if 'date' is temp and 'location' is the location input, sum new cases
                    sum_of_cases = sum_of_cases + int(self.covid_data["new_cases"][ind])
                    temp = temp + self.dt  # move to the next day
            else:  # if temp passed today, all past ten days cases are summed
                break


if __name__ == '__main__':
    c = Covid()
    print(c.recent_cases('Italy'))
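One thing stands out in the code above: recent_cases never returns sum_of_cases, so print(c.recent_cases('Italy')) prints None regardless of what was summed. A hedged sketch of the method with an explicit return, keeping the class attributes and column names ("date", "location", "new_cases") exactly as assumed from the code above:

    # Hedged sketch of the same method with an explicit return value;
    # everything else (covid_data, dt, ten_days) is taken from the class above.
    def recent_cases(self, cntry):
        today = datetime.today()
        temp = today - self.ten_days  # 10 days before today
        sum_of_cases = 0
        for ind in self.covid_data.index:
            if temp > today:  # all of the past ten days have been summed
                break
            if (self.covid_data["date"][ind] == temp) and (self.covid_data["location"][ind] == cntry):
                sum_of_cases += int(self.covid_data["new_cases"][ind])
                temp = temp + self.dt  # move to the next day
        return sum_of_cases

A vectorized pandas filter on location and a date range would likely be simpler than matching day by day, but the missing return is what produces the None.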

How can I collect a number of tweets between 2 given dates using tweepy?

For example, I want to collect 20 tweets for each day from 1 June to today's date, but right now I am only able to view the 20 most recent tweets of today's date.
def get_tweets_and_tones_json(searchTerm, NoOfTerms):
    output = []
    geo = "21.1498134,79.0820556,1045km"
    start_date = datetime.date(2020, 6, 1)
    end_date = datetime.date.today()
    tweets = tw.Cursor(api.search, q=searchTerm, count=NoOfTerms, geocode=geo, lang='en',
                       since=start_date, until=end_date).items(20)
    for i, tweet in enumerate(tweets):
        tweetedText = tweet.text
        jsonFile1 = analyze_tone(tweetedText)
        x = tweet.created_at
        print(tweetedText, '\n', x)
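A hedged sketch of one way to get a fixed number per day, mirroring the since/until usage from the call above and assuming the same api object (note that the standard search endpoint only indexes roughly the last 7 days, so dates much older than that may return nothing):

import datetime
import tweepy as tw

# Hedged sketch: build one Cursor per day by sliding a one-day since/until
# window, and take up to 20 items from each. Assumes api exists as above.
def get_tweets_per_day(searchTerm, per_day=20):
    geo = "21.1498134,79.0820556,1045km"
    day = datetime.date(2020, 6, 1)
    today = datetime.date.today()
    results = []
    while day <= today:
        next_day = day + datetime.timedelta(days=1)
        cursor = tw.Cursor(api.search, q=searchTerm, count=per_day, geocode=geo,
                           lang='en', since=str(day), until=str(next_day))
        for tweet in cursor.items(per_day):
            results.append((tweet.created_at, tweet.text))
        day = next_day
    return results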

getting an empty array from a request to eventful api

I'm trying to use the Eventful API to get information about only music events (concerts) between two dates. For example, I want to get the following information about each concert from 20171012 to 20171013:
- city
- performer
- country
- latitude
- longitude
- genre
- title
- image
- StarTime
I'm using a Python example available online and changed it to get the data above, but for now it isn't working; I'm only able to get this information:
{'latitude': '40.4',
 'longitude': '-3.68333',
 'start_time': '2017-10-12 20:00:00',
 'city_name': 'Madrid',
 'title': 'Kim Waters & Maysa Smooth en Hot Jazz Festival'}
But the performer, genre, country and image URL are not working. Do you know how to get that information? When I change the Python example below to request those fields, it always returns an empty array.
Python example that works (however, without the performer, genre, country and image URL; if I add those elements to event_features I get an empty array):
import requests
import datetime


def get_event(user_key, event_location, start_date, end_date, event_features, fname):
    data_lst = []  # output
    start_year = int(start_date[0:4])
    start_month = int(start_date[4:6])
    start_day = int(start_date[6:])
    end_year = int(end_date[0:4])
    end_month = int(end_date[4:6])
    end_day = int(end_date[6:])
    start_date = datetime.date(start_year, start_month, start_day)
    end_date = datetime.date(end_year, end_month, end_day)
    step = datetime.timedelta(days=1)
    while start_date <= end_date:
        date = str(start_date.year)
        if start_date.month < 10:
            date += '0' + str(start_date.month)
        else:
            date += str(start_date.month)
        if start_date.day < 10:
            date += '0' + str(start_date.day)
        else:
            date += str(start_date.day)
        date += "00"
        date += "-" + date
        url = "http://api.eventful.com/json/events/search?"
        url += "&app_key=" + user_key
        url += "&location=" + event_location
        url += "&date=" + date
        url += "&page_size=250"
        url += "&sort_order=popularity"
        url += "&sort_direction=descending"
        url += "&q=music"
        url += "&c=music"
        data = requests.get(url).json()
        try:
            for i in range(len(data["events"]["event"])):
                data_dict = {}
                for feature in event_features:
                    data_dict[feature] = data["events"]["event"][i][feature]
                data_lst.append(data_dict)
        except:
            pass
        print(data_lst)
        start_date += step


def main():
    user_key = ""
    event_location = "Madrid"
    start_date = "20171012"
    end_date = "20171013"
    event_location = event_location.replace("-", " ")
    start_date = start_date
    end_date = end_date
    event_features = ["latitude", "longitude", "start_time"]
    event_features += ["city_name", "title"]
    event_fname = "events.csv"
    get_event(user_key, event_location, start_date, end_date, event_features, event_fname)


if __name__ == '__main__':
    main()
You should debug your problem and not ignore all exceptions.
Replace the try: ... except: pass lines with:
data = requests.get(url).json()
if "event" in data.get("events", {}):
    for row in data["events"]["event"]:
        # print(row)  # while debugging, you can inspect the available data here
        data_dict = {feature: row[feature] for feature in event_features}
        data_lst.append(data_dict)
else:
    pass  # a problem - you can do something here
You will see a KeyError with the name of the missing feature that is not present in "row". You should fix the missing features and read the documentation for that service's API. The country feature is probably "country_name", similar to "city_name". Maybe you should also set the "include" parameter to request more sections of detail in the search than the defaults.
A universal try: ... except: pass should never be used, because "Errors should never pass silently." (The Zen of Python)
Read Handling Exceptions:
... The last except clause may omit the exception name(s), to serve as a wildcard. Use this with extreme caution, since it is easy to mask a real programming error in this way! ...
A more important place where unexpected exceptions are possible is requests.get(url).json(), e.g. a timeout. In any case, you should not continue the while loop if there is a problem.
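A hedged sketch of what that could look like, using requests' own exception classes (requests.exceptions.RequestException covers timeouts and connection errors, and raise_for_status surfaces HTTP error codes); the helper name fetch_events is just an illustration, not part of the original code:

import requests

# Hedged sketch: fetch one page and fail loudly on network/HTTP problems
# instead of silently continuing the loop with bad data.
def fetch_events(url):
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # raise on HTTP 4xx/5xx
        return response.json()
    except requests.exceptions.RequestException as exc:
        print("request failed:", exc)
        return None  # the caller can stop the while loop when this is None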
If you look at the data returned by eventful.com, a few things are clear:
- For country, the field to use is country_name. This was missing from your event_features list.
- There can be multiple performers for each event. To get all the performers, you need to add "performers" to your event_features list.
- There is no field named genre, so genre cannot be retrieved.
- The "image" field is always None, which means there is no image available.
Here is the modified code. Hopefully it works much better and helps you move forward.
import datetime
import requests

data_lst = []  # output
event_features = ["latitude", "longitude", "start_time", "city_name",
                  "country_name", "title", "image", "performers"]


def get_event(user_key, event_location, start_date, end_date):
    start_year = int(start_date[0:4])
    start_month = int(start_date[4:6])
    start_day = int(start_date[6:])
    end_year = int(end_date[0:4])
    end_month = int(end_date[4:6])
    end_day = int(end_date[6:])
    start_date = datetime.date(start_year, start_month, start_day)
    end_date = datetime.date(end_year, end_month, end_day)
    step = datetime.timedelta(days=1)
    while start_date <= end_date:
        date = str(start_date.year)
        if start_date.month < 10:
            date += '0' + str(start_date.month)
        else:
            date += str(start_date.month)
        if start_date.day < 10:
            date += '0' + str(start_date.day)
        else:
            date += str(start_date.day)
        date += "00"
        date += "-" + date
        url = "http://api.eventful.com/json/events/search?"
        url += "&app_key=" + user_key
        url += "&location=" + event_location
        url += "&date=" + date
        url += "&page_size=250"
        url += "&sort_order=popularity"
        url += "&sort_direction=descending"
        url += "&q=music"
        url += "&c=music"
        data = requests.get(url).json()
        print("==== Data Returned by eventful.com ====\n", data)
        try:
            for i in range(len(data["events"]["event"])):
                data_dict = {}
                for feature in event_features:
                    data_dict[feature] = data["events"]["event"][i][feature]
                data_lst.append(data_dict)
        except IndexError:
            pass
        print("====================================")
        print(data_lst)
        start_date += step


def main():
    user_key = "Enter Your Key Here"
    event_location = "Madrid"
    start_date = "20171012"
    end_date = "20171013"
    event_location = event_location.replace("-", " ")
    # event_fname = "events.csv"
    get_event(user_key, event_location, start_date, end_date)


if __name__ == '__main__':
    main()
I was able to successfully pull data from the Eventful API for the performer, image, and country fields. However, I don't think the Eventful Search API supports genre - I don't see it in their documentation.
To get country, I added "country_name", "country_abbr" to your event_features array. That adds these values to the resulting JSON:
'country_abbr': u'ESP',
'country_name': u'Spain'
Performers can also be retrieved by adding "performers" to event_features. That will add this to the JSON output:
'performers': {
    u'performer': {
        u'name': u'Kim Waters',
        u'creator': u'evdb',
        u'url': u'http://concerts.eventful.com/Kim-Waters?utm_source=apis&utm_medium=apim&utm_campaign=apic',
        u'linker': u'evdb',
        u'short_bio': u'Easy Listening / Electronic / Jazz',
        u'id': u'P0-001-000333271-4'
    }
}
To retrieve images, add "image" to the event_features array. Note, however, that not all events have images. You will either see 'image': None or
'image': {
    u'medium': {
        u'url': u'http://d1marr3m5x4iac.cloudfront.net/store/skin/no_image/categories/128x128/other.jpg',
        u'width': u'128',
        u'height': u'128'
    },
    u'thumb': {
        u'url': u'http://d1marr3m5x4iac.cloudfront.net/store/skin/no_image/categories/48x48/other.jpg',
        u'width': u'48',
        u'height': u'48'
    }
}
Good luck! :)

How to check limit range from csv file in python script

I am trying to fetch some values from a database and need to check the lower and upper limits of some variables, which are stored in a tab-separated text file. The text file looks like this:
Variable lower_limit upper_limit
temperature 20 40
pressure 0 100
temperature2 0 30
temperature3 20 25
and the data in the database looks like this:
usec temperature_data temperature2_data
1456411800 25 15
1456412400 45 25
1456413000 28 19
So I start by checking whether the variable is in the text file; if it is, I need to check the limits of that variable. So far I have only succeeded in verifying the name of the variable, but I am unable to check the limits.
My code is as follows:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import datetime as dt
import sys
import time
import datetime
import calendar
import numpy as np
import mysql.connector
import datetime
import numpy as np
import pandas as pd
import mysql.connector
from mysql.connector import errorcode

# starting day, month and year
start_day = dt.datetime(2016, 02, 25)
# total number of dates to visualize
number = 11
num_total = 11
# enter limit range
# these are hard-coded values which I want to replace; instead of hard-coding,
# I want to read these limit values from the text file
upper_limit = 250
lower_limit = 0
# start day in epoch time format
start_time = 1456411800
# variable name and filepath
filepath = '/home/robbyy/files/limit.txt'
vari_name = 'temperature2'
# database name, user and password details and query to fetch respective data
usr = 'roby'
password = 'xxxx'
db_name = 'roby_data'
insert_query = ("SELECT usec , temperature2_data "
                "FROM rob_table WHERE usec >= %s "
                "AND usec <= %s")


def generate_data():
    num = num_total
    cnx = mysql.connector.connect(user=usr, password=password,
                                  database=db_name)
    cursor = cnx.cursor()
    query = insert_query
    for i in range(number):
        current_start_ts = (start_time + (i*86400))
        current_day = datetime.datetime.fromtimestamp(current_start_ts)
        # print 'cd: ', current_day
        current_end_ts = (start_time + (i*86400)) + 86399
        cursor.execute(query, (current_start_ts * 1000000,
                               current_end_ts * 1000000))
        rows = cursor.fetchall()
        rows_arr = np.array(rows)
        # print 'rows all here li: ', rows
        with open(filepath, 'r') as f:
            limit_file = f.read()
        limits = {}
        for line in limit_file.splitlines():
            print 'line to see:', line
            variable, lower, upper = line.split()
            if not variable == 'Variable':
                limits[variable] = {'lower': int(lower),
                                    'upper': int(upper)}
        print 'limits: ', limits
        if vari_name in data:
            pass
            if len(rows_arr) == 0:
                continue
                # print 'no data is here'
            else:
                for item, index in rows_arr:
                    if index >= upper_limit or index <= lower_limit:
                        print 'data exceeds limit: ', index
                    else:
                        continue
                        # print 'data is within range: ', index
        else:
            print 'sorry: this variable name is invalid'
    start = start_day
    dates = [start + dt.timedelta(days=i) for i in range(num)]
    return dates


def main():
    dates = generate_data()

main()
If someone could help or guide me on how to check the lower and upper limits from the text file for the required variable, instead of hard-coding the values in the script, I would be grateful.
Thanks.
Just parse the limits file and, for example, create a dict out of it. Something like this:
def parse_limits(file):
    with open(file, 'r') as f:
        limit_file = f.read()
    limits = {}
    for line in limit_file.splitlines():
        variable, lower, upper = line.split()
        if not variable == 'Variable':
            limits[variable] = {'lower': int(lower),
                                'upper': int(upper)}
    return limits
That would result in a nested dict as follows:
{
    'pressure': {'upper': 100, 'lower': 0},
    'temperature2': {'upper': 30, 'lower': 0},
    'temperature': {'upper': 40, 'lower': 20},
    'temperature3': {'upper': 25, 'lower': 20}
}
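A quick hedged usage example against that dict (the sample value of 45 is just an illustration; in the real script the values come from the database rows):

# Hedged example: look up the limits for one variable and test a value against them.
limits = parse_limits('/home/robbyy/files/limit.txt')
vari_name = 'temperature2'
value = 45  # illustrative reading

if vari_name in limits:
    if value > limits[vari_name]['upper'] or value < limits[vari_name]['lower']:
        print 'data exceeds limit: ', value
else:
    print 'sorry: this variable name is invalid'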
Edit:
As requested, your final code might look something like this:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import datetime as dt
import sys
import time
import datetime
import calendar
import numpy as np
import mysql.connector
import datetime
import numpy as np
import pandas as pd
import mysql.connector
from mysql.connector import errorcode

# starting day, month and year
start_day = dt.datetime(2016, 02, 25)
# total number of dates to visualize
number = 11
num_total = 11
# enter limit range
# (these hard-coded values are now superseded by the limits read from the text file)
upper_limit = 250
lower_limit = 0
# start day in epoch time format
start_time = 1456411800
# variable name and filepath
filepath = '/home/robbyy/files/limit.txt'
vari_name = 'temperature2'
# database name, user and password details and query to fetch respective data
usr = 'roby'
password = 'xxxx'
db_name = 'roby_data'
insert_query = ("SELECT usec , temperature2_data "
                "FROM rob_table WHERE usec >= %s "
                "AND usec <= %s")


def parse_limits(file):
    with open(file, 'r') as f:
        limit_file = f.read()
    limits = {}
    for line in limit_file.splitlines():
        variable, lower, upper = line.split()
        if not variable == 'Variable':
            limits[variable] = {'lower': int(lower),
                                'upper': int(upper)}
    return limits

limits = parse_limits(filepath)


def generate_data():
    num = num_total
    cnx = mysql.connector.connect(user=usr, password=password,
                                  database=db_name)
    cursor = cnx.cursor()
    query = insert_query
    for i in range(number):
        current_start_ts = (start_time + (i*86400))
        current_day = datetime.datetime.fromtimestamp(current_start_ts)
        # print 'cd: ', current_day
        current_end_ts = (start_time + (i*86400)) + 86399
        cursor.execute(query, (current_start_ts * 1000000,
                               current_end_ts * 1000000))
        rows = cursor.fetchall()
        rows_arr = np.array(rows)
        # print 'rows all here li: ', rows
        print 'limits: ', limits
        if vari_name in limits:
            if len(rows_arr) == 0:
                continue
                # print 'no data is here'
            else:
                for item, index in rows_arr:
                    if (index >= limits[vari_name]['upper'] or
                            index <= limits[vari_name]['lower']):
                        print 'data exceeds limit: ', index
                    else:
                        continue
                        # print 'data is within range: ', index
        else:
            print 'sorry: this variable name is invalid'
    start = start_day
    dates = [start + dt.timedelta(days=i) for i in range(num)]
    return dates


def main():
    dates = generate_data()

main()

Data generation incomplete: Python random

I am trying to write a script to generate data, using the random package. When I execute the script everything seems fine, but when I look through the results I find that the script fails to write the last 100+ rows for some reason.
Can someone suggest why this could be happening?
from __future__ import print_function
from faker import Faker;
import random;

## Value declaration
population = 3;
product = 3;
years = 3;
months = 13;
days = 30;
tax = 3.5;

## Define column header
Column_Names = ("Population_ID", ";", "Product_Name", ";", "Product_ID", ";", "Year", ";",
                "Month", ";", "Day", "Quantity_sold", ";", "Sales_Price", ";", "Discount",
                ";", "Actual_Sales_Price", tax);

## Function to generate sales related information
def sales_data():
    for x in range(0, 1):
        quantity_sold = random.randint(5, 20);
        discount = random.choice(range(5, 11));
        sales_price = random.uniform(20, 30);
        return quantity_sold, round(sales_price, 2), discount, round((sales_price)-(sales_price*discount)+(sales_price*tax));

## Format the month to quarter and return the value
def quarter(month):
    if month >= 1 and month <= 3:
        return "Q1";
    elif month > 3 and month <= 6:
        return "Q2";
    elif month > 6 and month <= 9:
        return "Q3";
    else:
        return "Q4";

## Generate product_id
def product_name():
    str2 = "PROD";
    sample2 = random.sample([1, 2, 3, 4, 5, 6, 7, 8, 9], 5);
    string_list = [];
    for x in sample2:
        string_list.append(str(x));
    return (str2 + ''.join(string_list));

### Main starts here ###
result_log = open("C:/Users/Sangamesh.sangamad/Dropbox/Thesis/Data Preparation/GenData.csv", 'w')
print(Column_Names, result_log);

### Loop and generate data ###
for pop in range(0, population):
    pop = random.randint(55000, 85000);
    for prod_id in range(0, product):
        product_name2 = product_name();
        for year in range(1, years):
            for month in range(1, months):
                for day in range(1, 31):
                    a = sales_data();
                    rows = str(pop)+";"+product_name2+";"+str(prod_id)+";"+str(year)+";"+str(month)+";"+quarter(month)+";"+str(day)+";"+str(a[0])+";"+str(a[1])+";"+str(a[2])+";"+str(tax)+";"+str(a[3]);
                    print(rows, file=result_log);
                    # print(rows);
    tax = tax + 1;
You need to close a file to have the buffers flushed:
result_log.close()
Better still, use the file object as a context manager and have the with statement close it for you when the block exits:
filename = "C:/Users/Sangamesh.sangamad/Dropbox/Thesis/Data Preparation/GenData.csv"
with result_log = open(filename, 'w'):
# code writing to result_log
Rather than manually writing strings with delimiters in between, you should really use the csv module:
import csv
# ..
column_names = (
    "Population_ID", "Product_Name", "Product_ID", "Year",
    "Month", "Day", "Quantity_sold", "Sales_Price", "Discount",
    "Actual_Sales_Price", tax)
# ..
with open(filename, 'wb') as result_log:
    writer = csv.writer(result_log, delimiter=';')
    writer.writerow(column_names)
    # looping
    row = [pop, product_name2, prod_id, year, month, quarter(month), day,
           a[0], a[1], a[2], tax, a[3]]
    writer.writerow(row)
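A hedged sketch of how those pieces could fit together, reusing the question's helpers (sales_data, quarter, product_name), its constants and loop ranges, and the column_names tuple above; the 'wb' mode matches the Python 2 csv module used here (on Python 3 it would be 'w' with newline=''):

import csv
import random

# Hedged sketch: the with-block guarantees the file is flushed and closed on
# exit, which is what prevents the missing final rows. Assumes column_names
# from the answer above and the helpers/constants from the question.
with open("GenData.csv", 'wb') as result_log:
    writer = csv.writer(result_log, delimiter=';')
    writer.writerow(column_names)
    for pop in range(population):
        pop = random.randint(55000, 85000)
        for prod_id in range(product):
            product_name2 = product_name()
            for year in range(1, years):
                for month in range(1, months):
                    for day in range(1, 31):
                        a = sales_data()
                        writer.writerow([pop, product_name2, prod_id, year, month,
                                         quarter(month), day, a[0], a[1], a[2], tax, a[3]])
        tax = tax + 1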
