ValueError: Excel does not support datetimes with timezones - python

When I try to run my sreamlit app having function:
def get_tweets(Topic,Count):
i=0
#my_bar = st.progress(100) # To track progress of Extracted tweets
for tweet in tweepy.Cursor(api.search_tweets, q=Topic,count=100, lang="en",exclude='retweets').items():
time.sleep(0.1)
#my_bar.progress(i)
df.loc[i,"Date"] = tweet.created_at
df.loc[i,"User"] = tweet.user.name
df.loc[i,"IsVerified"] = tweet.user.verified
df.loc[i,"Tweet"] = tweet.text
df.loc[i,"Likes"] = tweet.favorite_count
df.loc[i,"RT"] = tweet.retweet_count
df.loc[i,"User_location"] = tweet.user.location
df.to_csv("TweetDataset.csv",index=False)
df.to_excel('{}.xlsx'.format("TweetDataset"),index=False) ## Save as Excel
i=i+1
if i>Count:
break
else:
pass
I get this error:
ValueError: Excel does not support datetimes with timezones. Please ensure that datetimes are timezone unaware before writing to Excel.

Related

How to retrieve only tweets about a hashtag within an hour?

I have been trying to write a python code to use snscrape to retrieve tweets about a hashtag within an hour. But my code has been returning an empty dataframe each time I tried.
This is what I have tried so far:
now = datetime.utcnow()
since = now - timedelta(hours=1)
since_str = since.strftime('%Y-%m-%d %H:%M:%S.%f%z')
until_str = now.strftime('%Y-%m-%d %H:%M:%S.%f%z')
# Query tweets with hashtag #SOSREX in the last one hour
query = '#SOSREX Since:' + since_str + ' until:' + until_str
SOSREX_data = []
SOSREX_data=[]
for tweet in sntwitter.TwitterSearchScraper(query).get_items():
if len(SOSREX_data)>100:
break
else:
SOSREX_data.append([tweet.date,tweet.user.username,tweet.user.displayname,
tweet.content,tweet.likeCount,tweet.retweetCount,
tweet.sourceLabel,tweet.user.followersCount,tweet.user.location
])
# Creating a dataframe from the tweets list above
Tweets_data = pd.DataFrame(SOSREX_data,
columns=["Date_tweeted","username","display_name",
"Tweets","Number_of_Likes","Number_retweets",
"Source_of_Tweet",
"number_of_followers","location"
])
print("Tweets_data")

Why does Twints "since" & "until" not work?

I'm trying to get all tweets from 2018-01-01 until now from various firms.
My code works, however I do not get the tweets from the time range. Sometimes I only get the tweets from today and yesterday or from mid April up to now, but not since the beginning of 2018. I've got then the message: [!] No more data! Scraping will stop now.
ticker = []
#read in csv file with company ticker in a list
with open('C:\\Users\\veron\\Desktop\\Test.csv', newline='') as inputfile:
for row in csv.reader(inputfile):
ticker.append(row[0])
#Getting tweets for every ticker in the list
for i in ticker:
searchstring = (f"{i} since:2018-01-01")
c = twint.Config()
c.Search = searchstring
c.Lang = "en"
c.Panda = True
c.Custom["tweet"] = ["date", "username", "tweet"]
c.Store_csv = True
c.Output = f"{i}.csv"
twint.run.Search(c)
df = pd. read_csv(f"{i}.csv")
df['company'] = i
df.to_csv(f"{i}.csv", index=False)
Does anyone had the same issues and has some tip?
You need to add the configuration parameter Since separately. For example:
c.Since = "2018-01-01"
Similarly for Until:
c.Until = "2017-12-27"
The official documentation might be helpful.
Since (string) - Filter Tweets sent since date, works only with twint.run.Search (Example: 2017-12-27).
Until (string) - Filter Tweets sent until date, works only with twint.run.Search (Example: 2017-12-27).

How to convert argument passed to string with TextBlob?

I'm trying to learn how to scrape tweets using python. I'm trying to use the following code, but I keep getting the error. I'm not sure how to rectify it.
def fetch_tweets(query, count = 50):
api = connect() # Gets the tweepy API object
tweets = [] # Empty list that stores all the tweets
try:
fetched_data = api.search_tweets(q = query + ' -filter:retweets', count = count)
for tweet in fetched_data:
txt = tweet.text
clean_txt = cleanText(txt) # Cleans the tweet
stem_txt = TextBlob(stem(clean_txt)) # Stems the tweet
sent = sentiment(stem_txt) # Gets the sentiment from the tweet
tweets.append((txt, clean_txt, sent))
return tweets
except tweepy.TweepyException as e:
print("Error: "+ str(e))
exit(1)
tweets = fetch_tweets(query = 'Birdman', count = 200)
# Converting the list into a pandas Dataframe
df = pd.DataFrame(tweets, columns= ['tweets', 'clean_tweets','sentiment'])
# Dropping the duplicate values just in case there are some tweets that are copied and then stores the data in a csv file
df = df.drop_duplicates(subset='clean_tweets')
df.to_csv('data.csv', index= False)
ptweets = df[df['sentiment'] == 'positive']
p_perc = 100 * len(ptweets)/len(tweets)
ntweets = df[df['sentiment'] == 'negative']
n_perc = 100 * len(ntweets)/len(tweets)
print(f'Positive tweets {p_perc} %')
print(f'Neutral tweets {100 - p_perc - n_perc} %')
print(f'Negative tweets {n_perc} %')
I keep getting the following error
TypeError: The `text` argument passed to `__init__(text)` must be a string, not <class 'NoneType'>
And this is the stem(text) function, which is where the problem seems to occur:
def stem(text):
porter = PorterStemmer()
token_words = word_tokenize(text)
stem_sentence = []
for word in token_words:
stem_sentence.append(porter.stem(word))
return " ".join(stem_sentence)
I saw this response in a different place but since i'm new to coding, I wasn't sure how to use it?
df['data'].apply(lambda x: sentiment(' '.join(x)))

Querying a mongodb raises stop iteration error

Using the following code, I am trying to extract two dates from an object in mongodb and then calculate the difference in time between the two dates -- if both year are in/past 2016. My current code raises the following error:
DeprecationWarning: generator 'QuerySet._iter_results' raised StopIteration
from ipykernel import kernelapp as app
My code:
raw_data = Document.objects()
data = []
for i in raw_data[:10]:
scored = i.date_scored
scored_date = pd.to_datetime(scored, format='%Y-%m-%d %H:%M')
if scored_date == "NoneType":
print("none")
elif scored_date.year >= 2016:
extracted = i.date_extracted
extracted_date = pd.to_datetime(extracted, format='%Y-%m-%d
%H:%M')
bank = i.bank.name
diff = scored_date - extracted_date
print(diff)
datum = [str(bank), str(extracted), str(scored), str(diff)]
data.append(datum)
else:
pass
Any help would be appreciated, thank you!

Speed up parsing Twitter from json to csv (python)

This is my first post so please bear with me.
I have a large (~1GB) json file of Tweets I collected via Twitter's Streaming API. I am able to successfully parse this out into a CSV with the fields I need, however, it is painfully slow - even with the few entities I am extracting (userid, lat/long, and parsing Twitter date string to date/time). What methods could I potentially use to try and speed this up? It currently takes several hours, and I'm anticipating collecting more data....
import ujson
from datetime import datetime
from dateutil import tz
from csv import writer
import time
def hms_string(sec_elapsed):
h = int(sec_elapsed / (60 * 60))
m = int((sec_elapsed % (60 * 60)) / 60)
s = sec_elapsed % 60.
return "{}:{:>02}:{:>05.2f}".format(h, m, s)
start_time = time.time()
with open('G:\Programming Projects\GGS 681\dmv_raw_tweets1.json', 'r') as in_file, \
open('G:\Programming Projects\GGS 681\dmv_tweets1.csv', 'w') as out_file:
print >> out_file, 'user_id,timestamp,latitude,longitude'
csv = writer(out_file)
tweets_count = 0
for line in in_file:
tweets_count += 1
tweets = ujson.loads(line)
timestamp = []
lats = ''
longs = ''
for tweet in tweets:
tweet = tweets
from_zone = tz.gettz('UTC')
to_zone = tz.gettz('America/New_York')
times = tweet['created_at']
for tweet in tweets:
times = tweets['created_at']
utc = datetime.strptime(times, '%a %b %d %H:%M:%S +0000 %Y')
utc = utc.replace(tzinfo=from_zone) #comment out to parse to utc
est = utc.astimezone(to_zone) #comment out to parse to utc
timestamp = est.strftime('%m/%d/%Y %I:%M:%S %p') # use %p to differentiate AM/PM
for tweet in tweets:
if tweets['geo'] and tweets['geo']['coordinates'][0]:
lats, longs = tweets['geo']['coordinates'][:2]
else:
pass
row = (
tweets['user']['id'],
timestamp,
lats,
longs
)
values = [(value.encode('utf8') if hasattr(value, 'encode') else value) for value in row]
csv.writerow(values)
end_time = time.time()
print "{} to execute this".format(hms_string(end_time - start_time))
It appears I may have solved this. Looking at the code I was actually running, it looks like my if/else statement below was incorrect.
for tweet in tweets:
if tweets['geo'] and tweets['geo']['coordinates'][0]:
lats, longs = tweets['geo']['coordinates'][:2]
else:
None
I was using else: None, when I should've been using pass or continue. Also, I removed the inner iteration in tweets in my original code. It was able to parse a 60mb file about 4 minutes. Still, if anyone has any tips for making this any faster, I'm open to your suggestions.
Edit: I also used ujson which has significantly increased the speed of loading/dumping the json data from twitter.

Categories