Saving dictionary of tweets into JSON file results in an empty dictionary - python

I am trying to collect some localized tweets and store them on my hard drive as a dictionary of tweets. In some iterations of the fetchsamples function the saved dictionary is forced into an empty state, even though data is added to the dictionary during the for loop (see output below).
I have tried different encodings and passing "w" and "wb" modes to my save call, but it didn't help.
I tried reproducing this with random strings (to make it easier for others to check my code) but was unable to. I am unsure what in the tweet structure or my code is causing this behaviour.
NOTE: I have added a code snippet to catch when the dictionary is forced into an empty state, for debugging.
import oauth2 as oauth
import urllib2 as urllib
import json
import pickle
import os

api_key = "Insert api_key here"
api_secret = "Insert api_secret here"
access_token_key = "Insert access_token_key"
access_token_secret = "Insert access_token_secret"

_debug = 0

oauth_token = oauth.Token(key=access_token_key, secret=access_token_secret)
oauth_consumer = oauth.Consumer(key=api_key, secret=api_secret)

signature_method_hmac_sha1 = oauth.SignatureMethod_HMAC_SHA1()

http_method = "GET"

http_handler = urllib.HTTPHandler(debuglevel=_debug)
https_handler = urllib.HTTPSHandler(debuglevel=_debug)

def twitterreq(url, method, parameters):
    req = oauth.Request.from_consumer_and_token(oauth_consumer,
                                                token=oauth_token,
                                                http_method=http_method,
                                                http_url=url,
                                                parameters=parameters)
    req.sign_request(signature_method_hmac_sha1, oauth_consumer, oauth_token)
    headers = req.to_header()

    if http_method == "POST":
        encoded_post_data = req.to_postdata()
    else:
        encoded_post_data = None
        url = req.to_url()

    opener = urllib.OpenerDirector()
    opener.add_handler(http_handler)
    opener.add_handler(https_handler)

    response = opener.open(url, encoded_post_data)
    return response

def fetchsamples():
    url = "https://stream.twitter.com/1/statuses/sample.json"
    url = "https://stream.twitter.com/1/statuses/filter.json?locations=-0.489,51.28,0.236,51.686"
    parameters = []
    response = twitterreq(url, "GET", parameters)
    data = {}
    count = 1
    for line in response:
        try:
            strip = json.loads(line.strip())
            if strip['coordinates'] != None:
                data[count] = strip
                count += 1
                if count % 10 == 0:
                    print count, len(data.keys())
        except Exception as e:
            # Print error and store in a log file
            print e
            with open("/Temp/Data/error.log","w") as log:
                log.write(str(e))
        # If 100 tweets have passed save the file
        if count % 100 == 0:
            print "Before saving: ", len(data.keys())
            fp = open("/Temp/Data/"+str(count/100)+".json","w")
            json.dump(data,fp,encoding="latin-1")
            fp.close()
            # This code is for debug purposes, to catch when
            # the dictionary is forced into an empty state
            if os.path.getsize("/Temp/Data/"+str(count/100)+".json") < 10:
                print "After saving: ", len(data.keys())
                return data
            else:
                data = {}

data = fetchsamples()
This produces the following output with no error. The data dictionary is empty.
100 99
Before saving: 99
110 10
120 20
130 30
140 40
150 50
160 60
170 70
180 80
190 90
200 100
Before saving: 100
Before saving: 0
After saving: 0

The dictionary is empty because after every 100 iterations you either set data = {} or the dictionary is already empty. If I understand correctly, you need a second dictionary, one that you never empty, and add items to that dictionary as well.
import oauth2 as oauth
import urllib2 as urllib
import json
import pickle
import os

api_key = "Insert api_key here"
api_secret = "Insert api_secret here"
access_token_key = "Insert access_token_key"
access_token_secret = "Insert access_token_secret"

_debug = 0

oauth_token = oauth.Token(key=access_token_key, secret=access_token_secret)
oauth_consumer = oauth.Consumer(key=api_key, secret=api_secret)

signature_method_hmac_sha1 = oauth.SignatureMethod_HMAC_SHA1()

http_method = "GET"

http_handler = urllib.HTTPHandler(debuglevel=_debug)
https_handler = urllib.HTTPSHandler(debuglevel=_debug)

def twitterreq(url, method, parameters):
    req = oauth.Request.from_consumer_and_token(oauth_consumer,
                                                token=oauth_token,
                                                http_method=http_method,
                                                http_url=url,
                                                parameters=parameters)
    req.sign_request(signature_method_hmac_sha1, oauth_consumer, oauth_token)
    headers = req.to_header()

    if http_method == "POST":
        encoded_post_data = req.to_postdata()
    else:
        encoded_post_data = None
        url = req.to_url()

    opener = urllib.OpenerDirector()
    opener.add_handler(http_handler)
    opener.add_handler(https_handler)

    response = opener.open(url, encoded_post_data)
    return response

def fetchsamples():
    url = "https://stream.twitter.com/1/statuses/sample.json"
    url = "https://stream.twitter.com/1/statuses/filter.json?locations=-0.489,51.28,0.236,51.686"
    parameters = []
    response = twitterreq(url, "GET", parameters)
    data = {}
    allData = {}
    count = 1
    for line in response:
        try:
            strip = json.loads(line.strip())
            if strip['coordinates'] != None:
                data[count] = strip
                allData[count] = strip
                count += 1
                if count % 10 == 0:
                    print count, len(data.keys())
        except Exception as e:
            # Print error and store in a log file
            print e
            with open("/Temp/Data/error.log","w") as log:
                log.write(str(e))
        # If 100 tweets have passed save the file
        if count % 100 == 0:
            print "Before saving: ", len(data.keys())
            fp = open("/Temp/Data/"+str(count/100)+".json","w")
            json.dump(data,fp,encoding="latin-1")
            fp.close()
            # Return data if the file is empty and stop
            if os.path.getsize("/Temp/Data/"+str(count/100)+".json") < 10:
                print "After saving: ", len(data.keys())
                return allData
            else:
                data = {}

data = fetchsamples()

The problem is in the way I incremented count. Since count is only incremented when strip["coordinates"] != None, receiving a tweet where strip["coordinates"] == None leaves count unchanged; at that point data is already {} and count % 100 == 0 is still True, so the original non-empty file is overwritten with an empty one.
The solution is to increment count after saving, like this:
if count % 100 == 0:
    print "Before saving: ", len(data.keys())
    fp = open("/Temp/Data/"+str(count/100)+".json","w")
    json.dump(data,fp,encoding="latin-1")
    fp.close()
    count += 1
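An equivalent safeguard, not part of the original fix but a small sketch along the same lines, is to only dump when data actually contains tweets; then a stalled count can never overwrite a previously saved file with an empty dictionary:
# Only write a file when there is something to save; a tweet without
# coordinates (which leaves count unchanged) then cannot trigger a
# second, empty dump over the same file.
if count % 100 == 0 and data:
    print "Before saving: ", len(data.keys())
    fp = open("/Temp/Data/" + str(count / 100) + ".json", "w")
    json.dump(data, fp, encoding="latin-1")
    fp.close()
    count += 1
    data = {}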

Related

Loop Through IP Address Range to Check Printer Status in HTML - Python & BeautifulSoup

I'm trying to loop through a few IP addresses, which belong to printers, to make sure the status is READY, and I'm not sure my code is actually looping through each one. The code should print Status: READY the first time it runs; then, every 2 minutes, it should check again and print "Nothing Changed" if nothing changed, otherwise print the status.
# Scale1 Ticket Printer 10.56.32.247
# Scale2 Ticket Printer 10.56.32.248
# Scale3 Ticket Printer 10.56.32.246
import sys
import requests
from bs4 import BeautifulSoup
import time

def main():
    result = []
    for ip in range(246, 248):
        resp = requests.get(f"http://10.56.32.%d" % ip)
        result.extend(resp)
        txt = resp.text
        soup = BeautifulSoup(txt, 'lxml')
        status = soup.find_all('h3')[0].text
        return status

res_before = ""
while True:
    res = main()
    if res != res_before:
        #print(res)
        res_before = res
    else:
        print("nothing changed")
    for i in range(120):
        msg = "Pausing for 2 minutes..."
        sys.stdout.write("\r{} {} seconds ".format(msg, i))
        time.sleep(1)
        sys.stdout.flush()
The first time the code runs it should print Status: READY, but it just prints "nothing changed".
Here are the results from the code. Thank you in advance for any help, it's much appreciated.
nothing changed
Pausing for 2 minutes... 119 seconds nothing changed
You need to return e.g. a dict with a key/value for each IP address.
from bs4 import BeautifulSoup
import requests
import time

def get_statuses():
    results = {}
    for ip in range(246, 248):
        resp = requests.get(f"http://10.56.32.{ip}")
        if resp.status_code != 200:
            results[ip] = f"Error {resp.status_code}"
        else:
            txt = resp.text
            soup = BeautifulSoup(txt, "lxml")
            results[ip] = soup.find_all("h3")[0].text
    return results

def main():
    old_statuses = None
    while True:
        new_statuses = get_statuses()
        if old_statuses != new_statuses:
            print("Status:", new_statuses)
            old_statuses = new_statuses
        print("Checking again in 2 minutes.")
        time.sleep(120)

if __name__ == "__main__":
    main()
To print only changed statuses, you could do something like
old_statuses = {}
while True:
    new_statuses = get_statuses()
    for key, value in new_statuses.items():
        if value != old_statuses.get(key):
            print("Changed:", key, value)
    old_statuses = new_statuses
And further, to give each machine a name, make a mapping of them:
addresses = {
    "Scale1": "http://10.56.32.247/",
    "Scale2": "http://10.56.32.248/",
    "Scale3": "http://10.56.32.246/",
}

def get_statuses():
    results = {}
    for name, address in addresses.items():
        resp = requests.get(address)
        if resp.status_code != 200:
            results[name] = f"Error {resp.status_code}"
        else:
            soup = BeautifulSoup(resp.text, "lxml")
            results[name] = soup.find_all("h3")[0].text
    return results
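For completeness, a small polling loop that ties the named mapping to the change-only printing shown above might look like this (the two-minute interval matches the question; nothing beyond the snippets already given is assumed):
import time

old_statuses = {}
while True:
    new_statuses = get_statuses()
    # Report only the scales whose status changed since the last poll
    for name, status in new_statuses.items():
        if status != old_statuses.get(name):
            print("Changed:", name, status)
    old_statuses = new_statuses
    time.sleep(120)  # check again in 2 minutes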

Search Splunk API using python

What I am trying to do is perform a search on Splunk's API using Python. I am able to get a session key, but that's it. I'm new to both Python and Splunk, so I'm a bit out of my depth, and any help would be really appreciated.
The error:
Traceback (most recent call last):
File "splunkAPI.py", line 31, in <module>
sid = minidom.parseString(r.text).getElementsByTagName('sid')[0].firstChild.nodeValue
IndexError: list index out of range
Python:
import time  # need for sleep
from xml.dom import minidom
import json, pprint
import requests
from requests.packages.urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

base_url = 'https://___________:8089'
username = '______'
password = '______'
search_query = "____________"

#-------------------------get session token------------------------
r = requests.get(base_url+"/servicesNS/admin/search/auth/login",
                 data={'username':username,'password':password}, verify=False)
session_key = minidom.parseString(r.text).getElementsByTagName('sessionKey')[0].firstChild.nodeValue
print ("Session Key:", session_key)

#-------------------- perform search -------------------------
r = requests.post(base_url + '/services/search/jobs/', data=search_query,
                  headers = { 'Authorization': ('Splunk %s' %session_key)},
                  verify = False)
sid = minidom.parseString(r.text).getElementsByTagName('sid')[0].firstChild.nodeValue

done = False
while not done:
    r = requests.get(base_url + '/services/search/jobs/' + sid,
                     headers = { 'Authorization': ('Splunk %s' %session_key)},
                     verify = False)
    response = minidom.parseString(r.text)
    for node in response.getElementsByTagName("s:key"):
        if node.hasAttribute("name") and node.getAttribute("name") == "dispatchState":
            dispatchState = node.firstChild.nodeValue
            print ("Search Status: ", dispatchState)
            if dispatchState == "DONE":
                done = True
            else:
                time.sleep(1)

r = requests.get(base_url + '/services/search/jobs/' + sid + '/results/',
                 headers = { 'Authorization': ('Splunk %s' %session_key)},
                 data={'output_mode': 'json'},
                 verify = False)
pprint.pprint(json.loads(r.text))
Hmm... that code looks awfully familiar :P Unfortunately, error checking wasn't that important when I wrote it.
The issue you see occurs if the search_query is not defined properly. It must start with search=. Also note that you need to include an initial search command if doing a standard Splunk search.
For example, search=search index=* will work, while search=index=* will not.
If you need to include quotes in your search string, I suggest you use something like the following format.
search_query = """search=search index=* "a search expression" | stats count"""
I tried this, but it did not give the needed result; I'm not sure what is missing.
import urllib.parse
import httplib2  # import library
import json
import pprint
import time
import re
from xml.dom import minidom

searchquery = 'search index="movable_in" sourcetype="movable:in:assets" | stats avg(exposure_score)'
myhttp = httplib2.Http()
baseurl = 'https://xxxx.splunkxxx.com:8089'
usernamesp = 'xxxx'
passwordsp = 'xxxx'

def get_splunk_result(searchquery):
    # Step 1: Get a session key
    servercontent = myhttp.request(f'{baseurl}/services/auth/login', 'POST', headers={},
                                   body=urllib.parse.urlencode({'username': usernamesp, 'password': passwordsp}))[1]
    sessionkey = minidom.parseString(servercontent).getElementsByTagName('sessionKey')[0].childNodes[0].nodeValue
    # print("====>sessionkey: %s <====" % sessionkey)
    sid = ''
    # ------------------
    if not searchquery.startswith('search'):
        searchquery = f'search {searchquery}'
    # Step 2: Get a sid with the search query
    i = 0
    while True:
        time.sleep(1)
        try:
            searchjob = myhttp.request(f'{baseurl}/services/search/jobs', 'POST',
                                       headers={'Authorization': f'Splunk {sessionkey}'},
                                       body=urllib.parse.urlencode({'search': searchquery}))[1]
            sid = minidom.parseString(searchjob).getElementsByTagName('sid')[0].childNodes[0].nodeValue
            break
        except:
            i = i + 1
            # print(i)
            if (i > 30): break
    # print("====>SID: %s <====" % sid)
    # Step 3: Get search status
    myhttp.add_credentials(usernamesp, passwordsp)
    servicessearchstatusstr = '/services/search/jobs/%s/' % sid
    isnotdone = True
    while isnotdone:
        searchstatus = myhttp.request(f'{baseurl}{servicessearchstatusstr}', 'GET')[1]
        isdonestatus = re.compile('isDone">(0|1)')
        strstatus = str(searchstatus)
        isdonestatus = isdonestatus.search(strstatus).groups()[0]
        if (isdonestatus == '1'):
            isnotdone = False
    # Step 4: Get the search result
    services_search_results_str = '/services/search/jobs/%s/results?output_mode=json_rows&count=0' % sid
    searchresults = myhttp.request(f'{baseurl}{services_search_results_str}', 'GET')[1]
    searchresults = json.loads(searchresults)
    # searchresults = splunk_result(searchresults)
    return searchresults

output = get_splunk_result(searchquery)
print(output)

it says "RESTART: C:\python\python ex\facebook EX.py" nothing else

import sys
import urllib.request
import json

if __name__ == '_main_':
    # [CODE1]
    page_name = "jtbnews"
    app_id = "[App ID]"
    app_secret = "[App Secret Code]"
    access_token = app_id + "I" + app_secret

    # [CODE2]
    # Builds a string of the form
    # http://graph.facebook.com/v2.8/[page_id]/?access_token=[App_ID]I[Secret_Key]
    base = "http://graph.facebook.com/v2.8"
    node = "/" + page_name
    parameters = "/?access_token=%s" % access_token
    url = base + node + parameters

    # [CODE3]
    req = urllib.request.Request(url)

    # [CODE4]
    try:
        response = urllib.request.urlopen(req)
        if response.getcode() == 200:
            data = json.loads(response.read().decode('utf-8'))
            page_id = data['id']
            print("%s Facebook numeric ID : %s" % (page_name, page_id))
    except Exception as e:
        print(e)
I'm a 100% beginner at programming and I am following the code from a book.
I can't figure out what the problem is.
It just says:
"RESTART: C:\python\python ex\facebook EX.py"

Use of files on hard drive instead of URL with Python

I would like to modify this script to use offline files. If I download the file from a URL it works, but the same file taken from my hard drive does not open. Can someone help me understand why, and how to do it? Thank you.
def INDEX():
    TVLIST('https://www.*********/playlist/*******/test.m3u')

def TVLIST(url):
    try:
        m3u = getHtml(url)
        parsem3u(m3u)
    except:
        addDir('Nothing found', '', '', '', Folder=False)
    xbmcplugin.endOfDirectory(int(sys.argv[1]))

urlopen = urllib2.urlopen
Request = urllib2.Request

def getHtml(url, referer=None, hdr=None, data=None):
    if not hdr:
        req = Request(url, data, headers)
    else:
        req = Request(url, data, hdr)
    if referer:
        req.add_header('Referer', referer)
    if data:
        req.add_header('Content-Length', len(data))
    response = urlopen(req)
    if response.info().get('Content-Encoding') == 'gzip':
        buf = StringIO(response.read())
        f = gzip.GzipFile(fileobj=buf)
        data = f.read()
        f.close()
    else:
        data = response.read()
    response.close()
    return data

def parsem3u(html, sitechk=True):
    match = re.compile('#.+,(.+?)\n(.+?)\n').findall(html)
    txtfilter = GETFILTER()
    txtfilter = txtfilter.split(',') if txtfilter else []
    txtfilter = [f.lower().strip() for f in txtfilter]
    i = 0
    count = 0
    for name, url in match:
        status = ""
        url = url.replace('\r','')
        if not txtfilter or any(f in name.lower() for f in txtfilter):
            if sitechk:
                if i < 5:
                    try:
                        siteup = urllib.urlopen(url).getcode()
                        status = " [COLOR red]offline[/COLOR]" if siteup != 200 else " [COLOR green]online[/COLOR]"
                    except:
                        status = " [COLOR red]offline[/COLOR]"
                    i += 1
            addPlayLink(name+status, url, 3, uiptvicon)
            count += 1
    return count
I thought it was enough to put the local path:
def INDEX():
    TVLIST(r'c:\Desktop\IPTVLIST\M3U\playlist\test.m3u')
Can someone explain why it does not work and how I can do it? Thank you.
As suggested by @languitar in the comments, you could use a file:// URL, which of course should work on Windows; but if you move to a platform like Android, you have a different file system there and no C drive. So make sure you have an alternative location for the file on Android.
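A minimal sketch of that idea (the helper name and the paths below are illustrative, not from the original add-on): read the playlist with open() when the source is a local path, and fall back to getHtml() for URLs.
import os

def LOADLIST(source):
    # Hypothetical helper: anything that exists on disk is read as a local
    # file, everything else is fetched over HTTP as before.
    if os.path.isfile(source):
        with open(source, 'r') as f:
            m3u = f.read()
    else:
        m3u = getHtml(source)
    parsem3u(m3u)

# Usage (paths are examples only):
# LOADLIST(r'c:\Desktop\IPTVLIST\M3U\playlist\test.m3u')
# LOADLIST('https://example.com/playlist/test.m3u')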

Parse JSON output in Python using Requests and its sessions

Here I have a rate stream that outputs the following, and I'm looking to print only the "bid" price. Could someone help explain how I can parse the output correctly? It's driving me crazy!
example = 1.05653
I need the output without quotes or any other markup as well.
JSON
{
    "tick": {
        "instrument": "EUR_USD",
        "time": "2015-04-13T14:28:26.123314Z",
        "bid": 1.05653,
        "ask": 1.05669
    }
}
My code:
import requests
import json
from optparse import OptionParser

def connect_to_stream():
    """
    Environment           <Domain>
    fxTrade               stream-fxtrade.oanda.com
    fxTrade Practice      stream-fxpractice.oanda.com
    sandbox               stream-sandbox.oanda.com
    """

    # Replace the following variables with your personal ones
    domain = 'stream-fxpractice.oanda.com'
    access_token = 'xxxxx'
    account_id = 'xxxxxxxxx'
    instruments = "EUR_USD"

    try:
        s = requests.Session()
        url = "https://" + domain + "/v1/prices"
        headers = {'Authorization' : 'Bearer ' + access_token,
                   # 'X-Accept-Datetime-Format' : 'unix'
                  }
        params = {'instruments' : instruments, 'accountId' : account_id}
        req = requests.Request('GET', url, headers = headers, params = params)
        pre = req.prepare()
        resp = s.send(pre, stream = True, verify = False)
        return resp
    except Exception as e:
        s.close()
        print "Caught exception when connecting to stream\n" + str(e)

def demo(displayHeartbeat):
    response = connect_to_stream()
    if response.status_code != 200:
        print response.text
        return
    for line in response.iter_lines(1):
        if line:
            try:
                msg = json.loads(line)
            except Exception as e:
                print "Caught exception when converting message into json\n" + str(e)
                return
            if msg.has_key("instrument") or msg.has_key("tick"):
                print line
            if displayHeartbeat:
                print line
            else:
                if msg.has_key("instrument") or msg.has_key("tick"):
                    print line

def main():
    usage = "usage: %prog [options]"
    parser = OptionParser(usage)
    parser.add_option("-b", "--displayHeartBeat", dest = "verbose", action = "store_true",
                      help = "Display HeartBeat in streaming data")
    displayHeartbeat = False
    (options, args) = parser.parse_args()
    if len(args) > 1:
        parser.error("incorrect number of arguments")
    if options.verbose:
        displayHeartbeat = True
    demo(displayHeartbeat)

if __name__ == "__main__":
    main()
Sorry if this is an extremely basic question, but I'm not that familiar with Python.
Thanks in advance!
You are iterating over the stream line by line, attempting to parse each line as JSON. Each line alone is not proper JSON, so that's one problem.
I would just run a regex over each line you bring in, looking for the text "bid": followed by a decimal number, and return that number as a float. For example:
import re

for line in response.iter_lines(1):
    matches = re.findall(r'\"bid\"\:\s(\d*\.\d*)', line)
    if len(matches) > 0:
        print float(matches[0])
Try something along the lines of this:
def demo(displayHeartbeat):
    response = connect_to_stream()
    for line in response.iter_lines():
        if line.startswith(" \"bid\""):
            print "bid:" + line.split(":")[1]
This actually turned out to be pretty easy: I fixed it by replacing the "demo" function with this:
def demo(displayHeartbeat):
    response = connect_to_stream()
    if response.status_code != 200:
        print response.text
        return
    for line in response.iter_lines(1):
        if line:
            try:
                msg = json.loads(line)
            except Exception as e:
                print "Caught exception when converting message into json\n" + str(e)
                return
            if displayHeartbeat:
                print line
            else:
                if msg.has_key("instrument") or msg.has_key("tick"):
                    print msg["tick"]["ask"] - .001
                    instrument = msg["tick"]["instrument"]
                    time = msg["tick"]["time"]
                    bid = msg["tick"]["bid"]
                    ask = msg["tick"]["ask"]
