I am new to Python and programming in general.
I wrote this script and it runs without error, but it doesn't print any content to the .csv even though I know there is content to print. I have been stuck for a day or two and need some help.
import sys
import json
import urllib
import oauth2 as oauth
import requests
import time
import csv
CONSUMER_KEY = ""
CONSUMER_SECRET = ""
ACCESS_KEY = ""
ACCESS_SECRET = ""
consumer = oauth.Consumer(key=CONSUMER_KEY, secret=CONSUMER_SECRET)
access_token = oauth.Token(key=ACCESS_KEY, secret=ACCESS_SECRET)
client = oauth.Client(consumer, access_token)
html ="https://api.twitter.com/1.1/search/tweets.json?q=#gmail.com"
response, data = client.request(html)
f = open("twitter_gmail.csv", 'a')
handle_tweet = json.loads(data)

def handle_tweet(self, data):
    search_terms = ['#gmail.com']
    text = message.get('text')
    words = text.split()
    matches = []
    for term in search_terms:
        match = [word for word in words if term in word]
        matches.append(match)
    f.write('%s,%s,%s,%s\n' % (message.get('created_at'), message.get('text'), message.get('user').get('id'), matches))
It looks like you are importing the csv module without even using it. There is probably something funky with your f.write statement, but things will be much easier for you if you try to write to your file using csv.writer. The csv.writer can easily take in a list, and spit out a comma separated line of the values in the list. I'd recommend reading its documentation in order to implement it.
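For example, here is a minimal sketch of what csv.writer does (the file name and field values here are made up):
import csv

# Minimal sketch: csv.writer takes a list and writes it out as one
# comma-separated line.
f = open("example.csv", "w")
writer = csv.writer(f)
writer.writerow(["2013-03-20", "some tweet text", 123456789])
f.close()
# example.csv now contains: 2013-03-20,some tweet text,123456789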
You're pretty close already. I'm not exactly sure what you want on each row, but the following might be close to your intended goal.
f = open("twitter_gmail.csv", 'a')
# This next line makes the csv writer.
writer = csv.writer(f)
handle_tweet = json.loads(data)

def handle_tweet(self, data):
    search_terms = ['#gmail.com']
    text = message.get('text')
    words = text.split()
    matches = []
    for term in search_terms:
        match = [word for word in words if term in word]
        matches.append(match)
    # The next line is how you write with a csv writer.
    writer.writerows(matches)
If I'm reading your code correctly, matches is a list of lists. I've used csv.writer's writerows method which will put each list on its own line (notice how it's plural, as opposed to writerow, which expects a single list and writes it to a single line).
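To illustrate the difference between the two (the data here is invented):
import csv

rows = [["a", "b"], ["c", "d"]]
with open("demo.csv", "w") as f:
    writer = csv.writer(f)
    writer.writerow(rows[0])   # writes one line:  a,b
    writer.writerows(rows)     # writes two lines: a,b then c,d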
I don't have Twitter auth keys, so can't test, but should work based on Twitter's API documentation.
If you are doing serious work (as opposed to figuring out how it is done), you should probably use a tested module like twitter or python-twitter.
import csv
import json
import oauth2 as oauth
import urllib
# I don't know if any of these are actually needed?
import sys
import requests
import time
CONSUMER_KEY = ''
CONSUMER_SECRET = ''
ACCESS_KEY = ''
ACCESS_SECRET = ''
class TwitterSearch:
    def __init__(self,
                 ckey=CONSUMER_KEY,
                 csecret=CONSUMER_SECRET,
                 akey=ACCESS_KEY,
                 asecret=ACCESS_SECRET,
                 query='https://api.twitter.com/1.1/search/tweets.{mode}?{query}'
                 ):
        consumer = oauth.Consumer(key=ckey, secret=csecret)
        access_token = oauth.Token(key=akey, secret=asecret)
        self.client = oauth.Client(consumer, access_token)
        self.query = query

    def search(self, q, mode='json', **queryargs):
        queryargs['q'] = q
        query = urllib.urlencode(queryargs)
        return self.client.request(self.query.format(query=query, mode=mode))

def write_csv(fname, rows, header=None, append=False, **kwargs):
    filemode = 'ab' if append else 'wb'
    with open(fname, filemode) as outf:
        out_csv = csv.writer(outf, **kwargs)
        if header:
            out_csv.writerow(header)
        out_csv.writerows(rows)

def main():
    ts = TwitterSearch()
    response, data = ts.search('#gmail.com', result_type='recent')
    js = json.loads(data)
    # This _should_ work, based on sample data from
    # https://dev.twitter.com/docs/api/1.1/get/search/tweets
    messages = ([msg['created_at'], msg['text'], msg['user']['id']] for msg in js.get('statuses', []))
    write_csv('twitter_gmail.csv', messages, append=True)

if __name__ == "__main__":
    main()
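For comparison, a rough sketch of the same search with the python-twitter module mentioned above might look like the following (untested, and the Api/GetSearch names are from memory of that library, so treat it as a starting point only):
import twitter  # pip install python-twitter

# Untested sketch based on python-twitter's documented interface.
api = twitter.Api(consumer_key='',
                  consumer_secret='',
                  access_token_key='',
                  access_token_secret='')

for status in api.GetSearch(term='#gmail.com'):
    print(status.created_at, status.text, status.user.id)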
I'm trying to tokenize all the tweets that I previously saved in a JSON file. I followed this example: https://marcobonzanini.com/2015/03/09/mining-twitter-data-with-python-part-2/
import re
import json
emoticons_str = r"""
(?:
[:=;] # Eyes
[oO\-]? # Nose (optional)
[D\)\]\(\]/\\OpP] # Mouth
)"""
regex_str = [
emoticons_str,
r'<[^>]+>', # HTML tags
r'(?:#[\w_]+)', # #-mentions
r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)", # hash-tags
r'http[s]?://(?:[a-z]|[0-9]|[$-_#.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+', # URLs
r'(?:(?:\d+,?)+(?:\.?\d+)?)', # numbers
r"(?:[a-z][a-z'\-_]+[a-z])", # words with - and '
r'(?:[\w_]+)', # other words
r'(?:\S)' # anything else
]
tokens_re = re.compile(r'('+'|'.join(regex_str)+')', re.VERBOSE | re.IGNORECASE)
emoticon_re = re.compile(r'^'+emoticons_str+'$', re.VERBOSE | re.IGNORECASE)
def tokenize(s):
return tokens_re.findall(s)
def preprocess(s, lowercase=False):
tokens = tokenize(s)
if lowercase:
tokens = [token if emoticon_re.search(token) else token.lower() for token in tokens]
return tokens
When I add this at the end, everything works:
tweet = 'RT @marcobonzanini: just an example! :D http://example.com #NLP'
print(preprocess(tweet))
I want to tokenize the tweets that I saved in a JSON file, and the website suggests doing it this way:
with open('mytweets.json', 'r') as f:
    for line in f:
        tweet = json.loads(line)
        tokens = preprocess(tweet['text'])
        do_something_else(tokens)
This is how I'm trying to open my JSON file:
>>> with open('data/digitalhealth.json', 'r') as f:
...     for line in f:
...         tweet = json.loads(line)
...         tokens = preprocess(tweet['text'])
...         do_something_else(tokens)
And this is what python returns:
Traceback (most recent call last):
File "<stdin>", line 4, in <module>
TypeError: list indices must be integers, not str
Does anyone know how to sort this out? I'm new to all this and I really don't have any idea what to do.
This is my code for collecting data from Twitter's API:
import tweepy
import json

API_KEY = 'xxx'
API_SECRET = 'xxx'
TOKEN_KEY = 'xxx'
TOKEN_SECRET = 'xxx'

auth = tweepy.OAuthHandler(API_KEY, API_SECRET)
auth.set_access_token(TOKEN_KEY, TOKEN_SECRET)
api = tweepy.API(auth, wait_on_rate_limit=True)

query = '#digitalhealth'
cursor = tweepy.Cursor(api.search, q=query, lang="en")

for page in cursor.pages():
    tweets = []
    for item in page:
        tweets.append(item._json)
    with open('Twitter_project/digitalhealth.json', 'wb') as outfile:
        json.dump(tweets, outfile)
How do I change it now so I will have only dictionaries?
Thanks all of you for your help! I really appreciate it
For some reason you're storing your JSON dictionaries in lists... You should try to store them as dictionaries, since that'd be much easier for you, but if you want to access them now then simply do tweet[0] to access the dictionary, and from there you can access the dictionary data like so: tweet[0]['text']. Still, look into reformatting the JSON properly.
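For example, since each line of the file holds a JSON list of tweet dictionaries, one way to adapt the loop from the question (preprocess and do_something_else are the question's own functions) might be:
import json

with open('data/digitalhealth.json', 'r') as f:
    for line in f:
        tweets = json.loads(line)      # a list of tweet dicts
        for tweet in tweets:           # iterate instead of indexing with a string
            tokens = preprocess(tweet['text'])
            do_something_else(tokens)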
I want the bot to reply with a certain text and picture every time it is mentioned, e.g. the text is "Winnie the Pooh" and the picture is of Winnie the Pooh. I've stored the images and texts in a JSON file. The error I get is this:
api.update_status("#"+tweet.user.screen_name+data[n]["text"]+media_ids=media_list,tweet.id)
^
SyntaxError: expression cannot contain assignment, perhaps you meant "=="?
I just can't figure out how to write a reply that includes a text and an image. Also, the lastseen.txt file holds the id of the tweet that last mentioned the bot.
import tweepy
import time
import random
import json

consumer_key = ''
consumer_secret = ''
key = ''
secret = ''

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(key, secret)
api = tweepy.API(auth)

with open('pll/file.json') as json_file:
    data = json.load(json_file)

randomlist = [0, 1, 2, 3, 4]
for i in range(0, 4):
    n = random.randint(0, 4)
    randomlist.append(n)

media_list = list()
response = api.media_upload(data[n]["image"])
media_list.append(response.media_id_string)

FILE_NAME = 'pll/lastseen.txt'

def read_last_seen(FILE_NAME):
    file_read = open(FILE_NAME, 'r')
    last_seen_id = int(file_read.read().strip())
    file_read.close()
    return last_seen_id

def store_last_seen(FILE_NAME, last_seen_id):
    file_write = open(FILE_NAME, 'w')
    file_write.write(str(last_seen_id))
    file_write.close()
    return

def reply():
    tweets = api.mentions_timeline(read_last_seen(FILE_NAME), tweet_mode='extended')
    for tweet in reversed(tweets):
        print(str(tweet.id) + ' _ ' + tweet.full_text)
        api.update_status("@"+tweet.user.screen_name+data[n]["text"]+media_ids=media_list,tweet.id)
        store_last_seen(FILE_NAME, tweet.id)

reply()
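For what it's worth, the SyntaxError is raised because media_ids=media_list is an assignment sitting inside a positional argument expression. A hedged sketch of how that call might be restructured with keyword arguments (untested, and it assumes tweepy's update_status accepts in_reply_to_status_id and media_ids keywords):
# Untested sketch: build the reply text first, then pass the keyword
# arguments separately instead of embedding "=" inside an expression.
status_text = "@" + tweet.user.screen_name + " " + data[n]["text"]
api.update_status(status=status_text,
                  in_reply_to_status_id=tweet.id,
                  media_ids=media_list)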
I have a text file called tokens.txt.
Ex: 12463,4126,6343,6345.
And I want to send a POST request with each token and use multithreading.
For some reason my code only gets the last token from the txt file and only uses that.
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed
from time import time
url_list = [
    "https://www.google.com/api/"
]

file_lines = open("tokens.txt", "r").readlines()
for line in file_lines:
    tokens = {
        'Token': line.replace('/n', '')
    }

def makerequest(url):
    while True:
        html = requests.post(url, stream=True, data=tokens)
        print(tokens)
        return html.content

start = time()
processes = []
with ThreadPoolExecutor(max_workers=200) as executor:
    for url in url_list:
        processes.append(executor.submit(makerequest, url))
    for task in as_completed(processes):
        print(task.result())

print(f'Time taken: {time() - start}')
How can I send a request for each token?
In your case, after the loop, tokens ends up as {"Token": <last_token>}.
Modify your code like this so that one request is sent for each token:
tokens = set()
'''
<- You can use a list also, but in this case a set is better as it will ensure
only one request per token even if your tokens file contains duplicate lines.
'''
url_list = [
    "https://www.google.com/api/"
]
tokens = set()
with open("tokens.txt", "r") as f:
    file_lines = f.readlines()

for line in file_lines:
    tokens.add(line.strip())

token_data = {"Token": None}

def makerequest(url):
    for token in tokens:
        token_data["Token"] = token
        html = requests.post(url, stream=True, data=token_data)
        print(token)
        # do something with html here
        # don't return or break
You are doing
data = tokens
and at that point tokens holds only the value built from the last line. If you want all tokens, you need to do something like:
tokens = set()
for line in file_lines:
    tokens.add(......)
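A slightly fuller sketch of that idea (tokens.txt, the Token payload key, and makerequest are taken from the question):
import requests

tokens = set()
with open("tokens.txt", "r") as f:
    for line in f:
        tokens.add(line.strip())

def makerequest(url):
    # One POST per token, each with its own payload dict.
    for token in tokens:
        html = requests.post(url, stream=True, data={'Token': token})
        print(token)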
The problem with your code is the creation of the tokens dictionary - you loop over the tokens but you always overwrite the value mapped to the "Token" key.
Moreover, there are a few bad practices in your code.
Please be careful with the inline opening of files like you did:
file_lines = open("tokens.txt", "r").readlines()
Rather, use it as a context manager:
with open("tokens.txt", "r") as file:
file_lines = file.readlines()
This makes sure that the file gets closed again after you read it; otherwise you would need to make sure yourself that the file gets closed (even after a crash, etc.).
Secondly, avoid using global variables in functions. From your code I assume that you want to query the different URLs with each token, so the function should accept both as arguments. Accordingly, I would then create a list of combinations like:
url_token_combs = [(url, token.strip()) for url in url_list for token in file_lines]
And finally, change your function to use the arguments handed to it rather than global ones:
def makerequest(url_token):
    url, token = url_token
    html = requests.post(url, stream=True, data=token)
    return html.content
That allows you to run your code with threads like:
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed
from time import time

def makerequest(url_token):
    url, token = url_token
    html = requests.post(url, stream=True, data=token)
    print(token)
    return html.content

if __name__ == "__main__":
    start = time()

    url_list = [
        "https://www.google.com/api/"
    ]

    with open("tokens.txt", "r") as file:
        file_lines = file.readlines()

    tokens = [{'Token': line.strip()} for line in file_lines]
    url_tokens = [(url, token) for url in url_list for token in tokens]

    processes = []
    with ThreadPoolExecutor(max_workers=200) as executor:
        for url_token in url_tokens:
            processes.append(executor.submit(makerequest, url_token))
        for task in as_completed(processes):
            print(task.result())

    print(f'Time taken: {time() - start}')
I'm trying to crawl tweets from the Netherlands through the Twitter API, but it returns an empty file and does not show any error. I know people in the Netherlands speak English and use Twitter, so it does not make sense. Here is my code:
The setup part (should be fine):
import tweepy
import csv
import json
import pandas as pd
from pprint import pprint
key_file = 'keys.json'
# Loading your keys from keys.json (which you should have filled
# in in question 1):
with open(key_file) as f:
    keys = json.load(f)
# if you print or view the contents of keys be sure to delete the cell!
consumer_key = keys["consumer_key"]
consumer_secret = keys["consumer_secret"]
access_token = keys["access_token"]
access_token_secret = keys["access_token_secret"]
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth, wait_on_rate_limit=True)
Here is the function I wrote to get data:
def tweet_obtainer(file_name, key_word=None, language=None, tweet_since, tweet_until,
                   geo=None, place=None, tweet_mode='extended', retweet='false', number_of_tweet=100):
    ds_tweets_save_path = file_name + '.json'
    example_tweets = [t._json for t in tweepy.Cursor(api.search, q=key_word,
                                                     lang=language,
                                                     since=tweet_since,
                                                     until=tweet_until,
                                                     geocode=geo,
                                                     place=place,
                                                     tweet_mode=tweet_mode,
                                                     retweet=retweet  # -filter:nativeretweets
                                                     ).items(number_of_tweet)]  # number of tweets
    with open(ds_tweets_save_path, "w") as f:
        json.dump(example_tweets, f)
    with open(ds_tweets_save_path, "r") as f:
        example_tweets = json.load(f)
    return None
Here is the line where I apply it to the Netherlands, and then I get an empty file, which is bizarre:
tweet_obtainer("data_netherlands", ["is"],"en","2018-06-01",
"2018-06-14",geo = "52.1326, 5.2913, 131km"
)
Any English tweet should contain "is", so it should not return an empty file!
I figured it out! It is because my geocode string has spaces in it. There should be no spaces, only commas, between lat, long and radius.
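For reference, the corrected call then looks something like this (only the geocode string changes):
# geocode must be "lat,long,radius" with no spaces
tweet_obtainer("data_netherlands", ["is"], "en", "2018-06-01",
               "2018-06-14", geo="52.1326,5.2913,131km")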
I am working on pulling logs through a web API, and so far the logs return in the format below (3 events, each starting with <attack_headlines version="1.0.1"> and ending with </attack_headlines>). My question is: what would be the best way to loop through each line and concatenate them so that each resulting event looks like the expected output below?
Current output
<attack_headlines version="1.0.1">
<attack_headline>
<site_id>1</site_id>
<category>V2luZG93cyBEaXJlY3RvcmllcyBhbmQgRmlsZXM=</category>
<subcategory>SUlTIEhlbHA=</subcategory>
<client_ip>172.17.1.126</client_ip>
<date>1363735940</date>
<gmt_diff>0</gmt_diff>
<reference_id>6D13-DE3D-9539-8980</reference_id>
</attack_headline>
</attack_headlines>
<attack_headlines version="1.0.1">
<attack_headline>
<site_id>1</site_id>
<category>V2luZG93cyBEaXJlY3RvcmllcyBhbmQgRmlsZXM=</category>
<subcategory>SUlTIEhlbHA=</subcategory>
<client_ip>172.17.1.136</client_ip>
<date>1363735971</date>
<gmt_diff>0</gmt_diff>
<reference_id>6D13-DE3D-9539-8981</reference_id>
</attack_headline>
</attack_headlines>
<attack_headlines version="1.0.1">
<attack_headline>
<site_id>1</site_id>
<category>V2luZG93cyBEaXJlY3RvcmllcyBhbmQgRmlsZXM=</category>
<subcategory>SUlTIEhlbHA=</subcategory>
<client_ip>172.17.1.156</client_ip>
<date>1363735975</date>
<gmt_diff>0</gmt_diff>
<reference_id>6D13-DE3D-9539-8982</reference_id>
</attack_headline>
</attack_headlines>
Expected output
<attack_headlines version="1.0.1"><attack_headline><site_id>1</site_id<category>V2luZG93cyBEaXJlY3RvcmllcyBhbmQgRmlsZXM=</category<subcategory>SUlTIEhlbHA=</subcategory><client_ip>172.17.1.156</client_ip<date>1363735975</date><gmt_diff>0</gmt_diff<reference_id>6D13-DE3D-9539-8982</reference_id></attack_headline</attack_headlines>
Thanks in advance!
import json
import os
from suds.transport.https import WindowsHttpAuthenticated

class Helpers:
    def set_connection(self, conf):
        #SUDS BUG FIXER(doctor)
        protocol = conf['protocol']
        hostname = conf['hostname']
        port = conf['port']
        path = conf['path']
        file = conf['file']
        u_name = conf['login']
        passwrd = conf['password']
        auth_type = conf['authType']
        from suds.xsd.doctor import ImportDoctor, Import
        from suds.client import Client
        url = '{0}://{1}:{2}/{3}/{4}?wsdl'.format(protocol,
                                                  hostname, port, path, file)
        imp = Import('http://schemas.xmlsoap.org/soap/encoding/')
        d = ImportDoctor(imp)
        if (auth_type == 'ntlm'):
            ntlm = WindowsHttpAuthenticated(username=u_name, password=passwrd)
            client = Client(url, transport=ntlm, doctor=d)
        else:
            client = Client(url, username=u_name, password=passwrd, doctor=d)
        return client

    def read_from_file(self, filename):
        try:
            fo = open(filename, "r")
            try:
                result = fo.read()
            finally:
                fo.close()
            return result
        except IOError:
            print "##Error opening/reading file {0}".format(filename)
            exit(-1)

    def read_json(self, filename):
        string = self.read_from_file(filename)
        return json.loads(string)

    def get_recent_attacks(self, client):
        import time
        import base64
        from xml.dom.minidom import parseString
        epoch_time_now = int(time.time())
        epochtimeread = open('epoch_last', 'r')
        epoch_time_last_read = epochtimeread.read()
        epochtimeread.close()
        epoch_time_last = int(float(epoch_time_last_read))
        print client.service.get_recent_attacks("", epoch_time_last, epoch_time_now, 1, "", 15)
If this is just a single, large string object with line-breaks, you can simply delete them:
import re
text = re.sub('\s*\n\s*', '', text)
To leave in the line breaks that follow the </attack_headline> delimiter, try:
re.sub('(?<!<\/attack_headline>)\s*\n\s*', '', x)
You could use:
oneline = "".join(multiline.split())
Edit 1 (I've just seen your edit) - I would change your code like this:
with open(filename, "r") as fo:
    result = []
    for line in fo.readlines():
        result.append(line.strip())
    return result
Edit 2 (I've read your comment on the other answer) - You could do it like this:
with open(filename, "r") as fo:
    partial = []
    for line in fo.readlines():
        if line.startswith("<"):
            yield "".join(partial)
            partial = []
        else:
            clean = line.strip()
            if clean:
                partial.append(clean)
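Since that fragment uses yield, it needs to live inside a generator function to run. A sketch of one way to wrap and call it (the function name and file name are made up, and it assumes the inner elements are indented in the raw log):
def iter_joined_events(filename):
    # Yield the accumulated indented lines, joined, each time a line
    # starting at column 0 with "<" is encountered.
    with open(filename, "r") as fo:
        partial = []
        for line in fo.readlines():
            if line.startswith("<"):
                yield "".join(partial)
                partial = []
            else:
                clean = line.strip()
                if clean:
                    partial.append(clean)

# Hypothetical usage:
for event in iter_joined_events("attack_log.xml"):
    if event:
        print(event)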
import re
# remove all newline whitespace stuff as in answer given before:
text = re.sub(r'\s*\n\s*', '', text)
# break again at desired points:
text = re.sub(r'</attack_headlines>', '</attack_headlines>\n', text)