Using files on the hard drive instead of a URL with Python

I would like to modify this script to use offline files. If I download the file from a URL it works, but when I open the same file from my hard drive it does not open. Can someone help me understand why, and how to do it? Thank you.
def INDEX():
    TVLIST('https://www.*********/playlist/*******/test.m3u')

def TVLIST(url):
    try:
        m3u = getHtml(url)
        parsem3u(m3u)
    except:
        addDir('Nothing found', '', '', '', Folder=False)
    xbmcplugin.endOfDirectory(int(sys.argv[1]))

urlopen = urllib2.urlopen
Request = urllib2.Request

def getHtml(url, referer=None, hdr=None, data=None):
    if not hdr:
        req = Request(url, data, headers)
    else:
        req = Request(url, data, hdr)
    if referer:
        req.add_header('Referer', referer)
    if data:
        req.add_header('Content-Length', len(data))
    response = urlopen(req)
    if response.info().get('Content-Encoding') == 'gzip':
        buf = StringIO(response.read())
        f = gzip.GzipFile(fileobj=buf)
        data = f.read()
        f.close()
    else:
        data = response.read()
    response.close()
    return data

def parsem3u(html, sitechk=True):
    match = re.compile('#.+,(.+?)\n(.+?)\n').findall(html)
    txtfilter = txtfilter = GETFILTER()
    txtfilter = txtfilter.split(',') if txtfilter else []
    txtfilter = [f.lower().strip() for f in txtfilter]
    i = 0
    count = 0
    for name, url in match:
        status = ""
        url = url.replace('\r','')
        if not txtfilter or any(f in name.lower() for f in txtfilter):
            if sitechk:
                if i < 5:
                    try:
                        siteup = urllib.urlopen(url).getcode()
                        status = " [COLOR red]offline[/COLOR]" if siteup != 200 else " [COLOR green]online[/COLOR]"
                    except:
                        status = " [COLOR red]offline[/COLOR]"
                    i += 1
            addPlayLink(name + status, url, 3, uiptvicon)
            count += 1
    return count
I thought it was enough to pass the local path:
def INDEX():
    TVLIST(r'c:\Desktop\IPTVLIST\M3U\playlist\test.m3u')
Can someone explain why it does not work, and how can I do it? Thank you.

As suggested by @languitar in the comments, you could use a file:// URL, which should of course work on Windows. But when you move to a platform like Android you have a different file system there; there is no C: drive. So make sure you have an alternative location for the file on Android.
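The reason the local path fails as-is is that getHtml() hands whatever it gets to urllib2.urlopen(), which expects a URL, not a bare Windows path. A minimal sketch of one way around it (just an illustration; getHtml, parsem3u, addDir and xbmcplugin come from the existing addon): read local playlists with open() and keep getHtml() for real URLs, or alternatively pass a file:// URL, which urllib2 also understands.
import os
import sys

def TVLIST(source):
    try:
        if os.path.isfile(source):
            # Local playlist: read it straight from disk.
            with open(source, 'r') as f:
                m3u = f.read()
        else:
            # Remote playlist (or a file:// URL): fetch it as before.
            m3u = getHtml(source)
        parsem3u(m3u)
    except:
        addDir('Nothing found', '', '', '', Folder=False)
    xbmcplugin.endOfDirectory(int(sys.argv[1]))

# file:// variant, if you prefer to keep a single code path:
# TVLIST('file:///C:/Desktop/IPTVLIST/M3U/playlist/test.m3u')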


Tweets streaming to .txt file with Python

I have the code below and want to write the stream of tweets to a text file. Is there a way to send the output to a text file within the same code and save it in the working directory? I am an IDE lover and really don't like using the console. I am new to Python (2 weeks); I am an R / RStudio user.
I know I could use:
python filename.py > output.txt
I am currently using Rodeo, Python 3.6.1.
import oauth2 as oauth
import urllib.request as urllib

api_key = "##"
api_secret = "##"
access_token_key = "##-##"
access_token_secret = "##"

_debug = 0

oauth_token = oauth.Token(key=access_token_key, secret=access_token_secret)
oauth_consumer = oauth.Consumer(key=api_key, secret=api_secret)

signature_method_hmac_sha1 = oauth.SignatureMethod_HMAC_SHA1()

http_method = "GET"

http_handler = urllib.HTTPHandler(debuglevel=_debug)
https_handler = urllib.HTTPSHandler(debuglevel=_debug)

'''
Construct, sign, and open a twitter request
using the hard-coded credentials above.
'''
def twitterreq(url, method, parameters):
    req = oauth.Request.from_consumer_and_token(oauth_consumer,
                                                token=oauth_token,
                                                http_method=http_method,
                                                http_url=url,
                                                parameters=parameters)
    req.sign_request(signature_method_hmac_sha1, oauth_consumer, oauth_token)
    headers = req.to_header()
    if http_method == "POST":
        encoded_post_data = req.to_postdata()
    else:
        encoded_post_data = None
        url = req.to_url()
    opener = urllib.OpenerDirector()
    opener.add_handler(http_handler)
    opener.add_handler(https_handler)
    response = opener.open(url, encoded_post_data)
    return response  # without this return, fetchsamples() would iterate over None

f = open("output.txt", "wb")

def fetchsamples():
    url = "https://stream.twitter.com/1.1/statuses/sample.json"
    parameters = []
    response = twitterreq(url, "GET", parameters)
    for line in response:
        f.write(line)

if __name__ == '__main__':
    fetchsamples()
    # f.close()
Besides the comment I made previously, I would suggest checking out this Stack Overflow question: how to direct output into a txt file in python in windows
To quote:
If you want to do it in Python then you would write:
with open('out.txt', 'w') as f:
    f.write(something)
Obviously this is just a trivial example. You'd clearly do more inside the with block.
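Applied to the code above, a minimal sketch (same twitterreq as in the question; only the file handling changes) opens the file with a with block inside fetchsamples instead of at module level:
def fetchsamples():
    url = "https://stream.twitter.com/1.1/statuses/sample.json"
    parameters = []
    response = twitterreq(url, "GET", parameters)
    # 'wb' because the stream yields bytes; the file lands in the current
    # working directory and is closed automatically when the block exits.
    with open("output.txt", "wb") as f:
        for line in response:
            f.write(line)

if __name__ == '__main__':
    fetchsamples()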

How to optimize the memory usage of my python crawler

I am learning to write Python crawlers these days, and I wrote a simple crawler to get pictures from Pixiv by Pixiv ID.
It works quite well, but there is a big problem: while it is running, it takes up nearly 1.2G of memory on my computer.
However, sometimes it takes up just 10M of memory, and I really don't know which code causes such big memory usage.
I have uploaded the script to my VPS (a Vultr server with only 768M of memory) and tried to run it. As a result, I get a MemoryError.
So I wonder how to optimize the memory usage (even if it takes more time to run).
Here is my code:
(I have rewritten all the code to make it pass PEP 8; if anything is still unclear, please tell me which part confuses you.)
from lxml import etree
import re
import os
import requests


# Get a single picture.
def get_single(Pixiv_ID, Tag_img_src, Headers):
    Filter_Server = re.compile("[\d]+")
    Filter_Posttime = re.compile("img\/[^_]*_p0")
    Posttime = Filter_Posttime.findall(Tag_img_src)[0]
    Server = Filter_Server.findall(Tag_img_src)[0]
    Picture_Type = [".png", ".jpg", ".gif"]
    for i in range(len(Picture_Type)):
        Original_URL = "http://i" + str(Server) + ".pixiv.net/img-original/"\
            + Posttime + Picture_Type[i]
        Picture = requests.get(Original_URL, headers=Headers, stream=True)
        if Picture.status_code == 200:
            break
    if Picture.status_code != 200:
        return -1
    Filename = "./pic/"\
        + str(Pixiv_ID) + "_p0"\
        + Picture_Type[i]
    Picture_File = open(Filename, "wb+")
    for chunk in Picture.iter_content(None):
        Picture_File.write(chunk)
    Picture_File.close()
    Picture.close()
    return 200


# Get a manga, which is a bundle of pictures.
def get_manga(Pixiv_ID, Tag_a_href, Tag_img_src, Headers):
    os.mkdir("./pic/" + str(Pixiv_ID))
    Filter_Server = re.compile("[\d]+")
    Filter_Posttime = re.compile("img\/[^_]*_p")
    Manga_URL = "http://www.pixiv.net/" + Tag_a_href
    Manga_HTML = requests.get(Manga_URL, headers=Headers)
    Manga_XML = etree.HTML(Manga_HTML.content)
    Manga_Pages = Manga_XML.xpath('/html/body'
                                  '/nav[@class="page-menu"]'
                                  '/div[@class="page"]'
                                  '/span[@class="total"]/text()')[0]
    Posttime = Filter_Posttime.findall(Tag_img_src)[0]
    Server = Filter_Server.findall(Tag_img_src)[0]
    Manga_HTML.close()
    Picture_Type = [".png", ".jpg", ".gif"]
    for Number in range(int(Manga_Pages)):
        for i in range(len(Picture_Type)):
            Original_URL = "http://i" + str(Server) + \
                ".pixiv.net/img-original/"\
                + Posttime + str(Number) + Picture_Type[i]
            Picture = requests.get(Original_URL, headers=Headers, stream=True)
            if Picture.status_code == 200:
                break
        if Picture.status_code != 200:
            return -1
        Filename = "./pic/" + str(Pixiv_ID) + "/"\
            + str(Pixiv_ID) + "_p"\
            + str(Number) + Picture_Type[i]
        Picture_File = open(Filename, "wb+")
        for chunk in Picture.iter_content(None):
            Picture_File.write(chunk)
        Picture_File.close()
        Picture.close()
    return 200


# Main function.
def get_pic(Pixiv_ID):
    Index_URL = "http://www.pixiv.net/member_illust.php?"\
        "mode=medium&illust_id=" + str(Pixiv_ID)
    Headers = {'referer': Index_URL}
    Index_HTML = requests.get(Index_URL, headers=Headers, stream=True)
    if Index_HTML.status_code != 200:
        return Index_HTML.status_code
    Index_XML = etree.HTML(Index_HTML.content)
    Tag_a_href_List = Index_XML.xpath('/html/body'
                                      '/div[@id="wrapper"]'
                                      '/div[@class="newindex"]'
                                      '/div[@class="newindex-inner"]'
                                      '/div[@class="newindex-bg-container"]'
                                      '/div[@class="cool-work"]'
                                      '/div[@class="cool-work-main"]'
                                      '/div[@class="img-container"]'
                                      '/a/@href')
    Tag_img_src_List = Index_XML.xpath('/html/body'
                                       '/div[@id="wrapper"]'
                                       '/div[@class="newindex"]'
                                       '/div[@class="newindex-inner"]'
                                       '/div[@class="newindex-bg-container"]'
                                       '/div[@class="cool-work"]'
                                       '/div[@class="cool-work-main"]'
                                       '/div[@class="img-container"]'
                                       '/a/img/@src')
    if Tag_a_href_List == [] or Tag_img_src_List == []:
        return 404
    else:
        Tag_a_href = Tag_a_href_List[0]
        Tag_img_src = Tag_img_src_List[0]
    Index_HTML.close()
    if Tag_a_href.find("manga") != -1:
        return get_manga(Pixiv_ID, Tag_a_href, Tag_img_src, Headers)
    else:
        return get_single(Pixiv_ID, Tag_img_src, Headers)


# Check whether the picture already exists.
def check_exist(Pixiv_ID):
    if not os.path.isdir("Pic"):
        os.mkdir("Pic")
    if os.path.isdir("./Pic/" + str(Pixiv_ID)):
        return True
    Picture_Type = [".png", ".jpg", ".gif"]
    Picture_Exist = False
    for i in range(len(Picture_Type)):
        Path = "./Pic/" + str(Pixiv_ID)\
            + "_p0" + Picture_Type[i]
        if os.path.isfile(Path):
            return True
    return Picture_Exist


# The script starts here.
for i in range(0, 38849402):
    Pixiv_ID = 38849402 - i
    Picture_Exist = check_exist(Pixiv_ID)
    if not Picture_Exist:
        Return_Code = get_pic(Pixiv_ID)
        if Return_Code == 200:
            print str(Pixiv_ID), "finish!"
        elif Return_Code == -1:
            print str(Pixiv_ID), "got an unknown error."
        elif Return_Code == 404:
            print str(Pixiv_ID), "not found. Maybe deleted."
    else:
        print str(Pixiv_ID), "picture exists!"
OMG!
Finally, I know what went wrong.
I used mem_top() to see what takes up the memory.
Guess what?
It is this line: for i in range(0, 38849402):
In memory there is a list [0, 1, 2, 3 ... 38849401], and that list is what takes up my memory.
I changed it to:
Pixiv_ID = 38849402
while Pixiv_ID > 0:
    # some code here
    Pixiv_ID = Pixiv_ID - 1
Now the memory usage is no more than 20M.
Feeling excited!
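For reference, a minimal alternative sketch (assuming Python 2, which the print statements suggest): keep the for loop but use the lazy xrange instead of range, so the 38-million-element list is never built.
# xrange yields the IDs one at a time instead of materializing a full list.
for i in xrange(0, 38849402):
    Pixiv_ID = 38849402 - i
    # ... same loop body as before ...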

Saving dictionary of tweets into JSON file results in an empty dictionary

I am trying to collect some localized tweets and store them on my hard drive as a dictionary of tweets. In some iterations of the fetchsamples function, the saved dictionary ends up empty, even though data is added to the dictionary during the for loop (see the output below).
I have tried different encodings and passing "w" and "wb" flags to my save function, but it didn't help.
I tried reproducing this with random strings (to make it easier for people to check my code) but I was unable to. I am unsure what in the tweet structure or in my code is causing this behaviour.
NOTE: I have added a code snippet to catch, for debugging purposes, when the dictionary is forced into an empty state.
import oauth2 as oauth
import urllib2 as urllib
import json
import pickle
import os

api_key = "Insert api_key here"
api_secret = "Insert api_secret here"
access_token_key = "Insert access_token_key"
access_token_secret = "Insert access_token_secret"

_debug = 0

oauth_token = oauth.Token(key=access_token_key, secret=access_token_secret)
oauth_consumer = oauth.Consumer(key=api_key, secret=api_secret)

signature_method_hmac_sha1 = oauth.SignatureMethod_HMAC_SHA1()

http_method = "GET"

http_handler = urllib.HTTPHandler(debuglevel=_debug)
https_handler = urllib.HTTPSHandler(debuglevel=_debug)

def twitterreq(url, method, parameters):
    req = oauth.Request.from_consumer_and_token(oauth_consumer,
                                                token=oauth_token,
                                                http_method=http_method,
                                                http_url=url,
                                                parameters=parameters)
    req.sign_request(signature_method_hmac_sha1, oauth_consumer, oauth_token)
    headers = req.to_header()
    if http_method == "POST":
        encoded_post_data = req.to_postdata()
    else:
        encoded_post_data = None
        url = req.to_url()
    opener = urllib.OpenerDirector()
    opener.add_handler(http_handler)
    opener.add_handler(https_handler)
    response = opener.open(url, encoded_post_data)
    return response

def fetchsamples():
    url = "https://stream.twitter.com/1/statuses/sample.json"
    url = "https://stream.twitter.com/1/statuses/filter.json?locations=-0.489,51.28,0.236,51.686"
    parameters = []
    response = twitterreq(url, "GET", parameters)
    data = {}
    count = 1
    for line in response:
        try:
            strip = json.loads(line.strip())
            if strip['coordinates'] != None:
                data[count] = strip
                count += 1
                if count % 10 == 0:
                    print count, len(data.keys())
        except Exception as e:
            # Print error and store in a log file
            print e
            with open("/Temp/Data/error.log","w") as log:
                log.write(str(e))
        # If 100 tweets have passed save the file
        if count % 100 == 0:
            print "Before saving: ", len(data.keys())
            fp = open("/Temp/Data/"+str(count/100)+".json","w")
            json.dump(data,fp,encoding="latin-1")
            fp.close()
            # This code is for debug purposes, to catch when
            # the dictionary is forced into empty state
            if os.path.getsize("/Temp/Data/"+str(count/100)+".json") < 10:
                print "After saving: ", len(data.keys())
                return data
            else:
                data = {}

data = fetchsamples()
This produces the following output with no error. The data dictionary is empty.
100 99
Before saving: 99
110 10
120 20
130 30
140 40
150 50
160 60
170 70
180 80
190 90
200 100
Before saving: 100
Before saving: 0
After saving: 0
The dictionary is empty because after every 100 iterations you either set data = {} or the dictionary is already empty. If I understand correctly, you need another dictionary, one which you never empty, and push items into that dictionary as well.
import oauth2 as oauth
import urllib2 as urllib
import json
import pickle
import os

api_key = "Insert api_key here"
api_secret = "Insert api_secret here"
access_token_key = "Insert access_token_key"
access_token_secret = "Insert access_token_secret"

_debug = 0

oauth_token = oauth.Token(key=access_token_key, secret=access_token_secret)
oauth_consumer = oauth.Consumer(key=api_key, secret=api_secret)

signature_method_hmac_sha1 = oauth.SignatureMethod_HMAC_SHA1()

http_method = "GET"

http_handler = urllib.HTTPHandler(debuglevel=_debug)
https_handler = urllib.HTTPSHandler(debuglevel=_debug)

def twitterreq(url, method, parameters):
    req = oauth.Request.from_consumer_and_token(oauth_consumer,
                                                token=oauth_token,
                                                http_method=http_method,
                                                http_url=url,
                                                parameters=parameters)
    req.sign_request(signature_method_hmac_sha1, oauth_consumer, oauth_token)
    headers = req.to_header()
    if http_method == "POST":
        encoded_post_data = req.to_postdata()
    else:
        encoded_post_data = None
        url = req.to_url()
    opener = urllib.OpenerDirector()
    opener.add_handler(http_handler)
    opener.add_handler(https_handler)
    response = opener.open(url, encoded_post_data)
    return response

def fetchsamples():
    url = "https://stream.twitter.com/1/statuses/sample.json"
    url = "https://stream.twitter.com/1/statuses/filter.json?locations=-0.489,51.28,0.236,51.686"
    parameters = []
    response = twitterreq(url, "GET", parameters)
    data = {}
    allData = {}
    count = 1
    for line in response:
        try:
            strip = json.loads(line.strip())
            if strip['coordinates'] != None:
                data[count] = strip
                allData[count] = strip
                count += 1
                if count % 10 == 0:
                    print count, len(data.keys())
        except Exception as e:
            # Print error and store in a log file
            print e
            with open("/Temp/Data/error.log","w") as log:
                log.write(str(e))
        # If 100 tweets have passed save the file
        if count % 100 == 0:
            print "Before saving: ", len(data.keys())
            fp = open("/Temp/Data/"+str(count/100)+".json","w")
            json.dump(data,fp,encoding="latin-1")
            fp.close()
            # Return data if the file is empty and stop
            if os.path.getsize("/Temp/Data/"+str(count/100)+".json") < 10:
                print "After saving: ", len(data.keys())
                return allData
            else:
                data = {}

data = fetchsamples()
The problem is in the way I incremented the count value. Since count is incremented only when strip["coordinates"] != None, if I receive a tweet where strip["coordinates"] == None the count value does not get incremented, but data has already been reset to {} and count % 100 == 0 is still True, which means the original non-empty file is replaced with an empty one.
The solution is to increment count after saving, like this:
if count % 100 == 0:
    print "Before saving: ", len(data.keys())
    fp = open("/Temp/Data/"+str(count/100)+".json","w")
    json.dump(data,fp,encoding="latin-1")
    fp.close()
    count += 1
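Just as a sketch of an equivalent guard (not from the original thread): only write a batch when the current dictionary actually holds tweets, which also keeps a previous non-empty file from being overwritten with an empty one.
if count % 100 == 0 and data:
    with open("/Temp/Data/" + str(count / 100) + ".json", "w") as fp:
        # encoding="latin-1" matches the original call (Python 2 json module)
        json.dump(data, fp, encoding="latin-1")
    data = {}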

Python implementation for WorkEtc. API

I've found this post on the Work Etc. forums for a Python REST client. The forum didn't preserve the code's indentation and the author didn't include it, so I've filled in what I believe to be the correct indentation and ended up with the following:
import sys, json, urllib
from httplib2 import Http

class WORKetcRESTClient():
    session_key = None
    connector_hash = None

    def __init__(self, url):
        if not "http://" in url and not "https://" in url:
            url = "http://%s" % url
            self.base_url = url
        else:
            self.base_url = url

    def authenticate(self, user, password):
        args = {"email": user, "pass": password, }
        res = self.request("AuthenticateWebSafe", args)
        if res["Code"] == 1:
            self.session_key = res["SessionKey"]
            self.user = res["User"]
            return True
        else:
            return False

    def request(self, service, args):
        url = "%s/%s" % (self.base_url, service)
        if not self.session_key is None:
            url = "%s?VeetroSession=%s" % (url, self.session_key)
            p = {}
            p['data'] = json.dumps(args)
            h = Http()
            r, c = h.request(url, body=json.dumps(args), method="POST", headers={'contentType': 'application/json; charset=utf-8'})
        if r.status == 200:
            jsondata = json.loads(c)
            return jsondata
        else:
            print r, c
            return {}

client = WORKetcRESTClient('http://company.worketc.com')
client.authenticate('User@company.com', 'pAsSwOrD')
result = client.request('FindCompanies', {'keywords': 'customer'})
print result
To be 100% honest, if this were to run without any errors I'm not sure what would get printed to the console, but the errors I'm getting are keeping me from finding out:
Traceback (most recent call last):
  File "worketc.py", line 42, in <module>
  File "worketc.py", line 17, in authenticate
    res = self.request("AuthenticateWebSafe", args)
  File "worketc.py", line 34, in request
    if r.status == 200:
UnboundLocalError: local variable 'r' referenced before assignment
It's telling me that the variable 'r' is being referenced before it is assigned/created, but I'm not sure where it needs to be created, or moved to, given its location in the current script. Can anybody shed some light on this?
if not self.session_key is None:
    url = "%s?VeetroSession=%s" % (url, self.session_key)
p = {}
p['data'] = json.dumps(args)
h = Http()
r, c = h.request(url, body=json.dumps(args), method="POST", headers={'contentType': 'application/json; charset=utf-8'})
is the correct indentation. In your version everything from p = {} down to the h.request(...) call is nested inside the if block, so when session_key is still None (as it is during authenticate) r and c are never assigned, and the later if r.status == 200 raises the UnboundLocalError.
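Put together, the whole request method reads roughly like this (the same code as in the question, only re-indented):
def request(self, service, args):
    url = "%s/%s" % (self.base_url, service)
    # Only the session-key query string depends on the check; the request
    # itself must run on every call so r and c are always bound.
    if self.session_key is not None:
        url = "%s?VeetroSession=%s" % (url, self.session_key)
    p = {}
    p['data'] = json.dumps(args)
    h = Http()
    r, c = h.request(url, body=json.dumps(args), method="POST",
                     headers={'contentType': 'application/json; charset=utf-8'})
    if r.status == 200:
        jsondata = json.loads(c)
        return jsondata
    else:
        print r, c
        return {}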

Problems using multipart_encode (poster library)

I am trying to upload a file using multipart_encode to handle the MIME encoding. However, I get the following error: AttributeError: multipart_yielder instance has no attribute '__len__'. Below is my approach; I would really appreciate it if anyone could give me some suggestions.
url = "https://pi-user-files.s3-external-1.amazonaws.com/"
post_data = {}
#data is a dict
post_data['AWSAccessKeyId']=(data['ticket']['AWSAccessKeyId'])
post_data['success_action_redirect']=(data['ticket']['success_action_redirect'])
post_data['acl']=(data['ticket']['acl'])
post_data['key']=(data['ticket']['key'])
post_data['signature']=(data['ticket']['signature'])
post_data['policy']=(data['ticket']['policy'])
post_data['Content-Type']=(data['ticket']['Content-Type'])
#I would like to upload a text file "new 2"
post_data['file']=open("new 2.txt", "rb")
datagen, headers = multipart_encode(post_data)
request2 = urllib2.Request(url, datagen, headers)
result = urllib2.urlopen(request2)
If you want to send a file, you should wrap the other parameters in MultipartParam objects. Example code for creating a file-upload request:
from poster.encode import multipart_encode, MultipartParam
import urllib2

def postFileRequest(url, paramName, fileObj, additionalHeaders={}, additionalParams={}):
    items = []
    # wrap post parameters
    for name, value in additionalParams.items():
        items.append(MultipartParam(name, value))
    # add file
    items.append(MultipartParam.from_file(paramName, fileObj))
    datagen, headers = multipart_encode(items)
    # add headers
    for item, value in additionalHeaders.iteritems():
        headers[item] = value
    return urllib2.Request(url, datagen, headers)
Also, I think you should execute register_openers() once at the beginning. You can find some details in the docs.
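For completeness, a rough usage sketch under those assumptions (register_openers comes from poster.streaminghttp; the field name and the extra parameter below are only illustrative placeholders):
from poster.streaminghttp import register_openers
import urllib2

# Register poster's streaming handlers with urllib2 once, before any upload.
register_openers()

f = open("new 2.txt", "rb")
req = postFileRequest("https://pi-user-files.s3-external-1.amazonaws.com/",
                      "file", f,
                      additionalParams={"acl": "private"})
print urllib2.urlopen(req).read()
f.close()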
The problem is that in httplib.py the generator is not detected as such and is instead treated like a string that holds the full data to be sent (and therefore httplib tries to find its length):
if hasattr(data,'read') and not isinstance(data, array): # generator
    if self.debuglevel > 0: print "sendIng a read()able"
    ....
A solution is to make the generator act like a read()able:
class GeneratorToReadable():
    def __init__(self, datagen):
        self.generator = datagen
        self._end = False
        self.data = ''

    def read(self, n_bytes):
        while not self._end and len(self.data) < n_bytes:
            try:
                next_chunk = self.generator.next()
                if next_chunk:
                    self.data += next_chunk
                else:
                    self._end = True
            except StopIteration:
                self._end = True
        result = self.data[0:n_bytes]
        self.data = self.data[n_bytes:]
        return result
and use it like so:
datagen, headers = multipart_encode(post_data)
readable = GeneratorToReadable(datagen)
req = urllib2.Request(url, readable, headers)
result = urllib2.urlopen(req)
