I tried to run a simple crawler in Python:
import sys
import csv
import socket
import sqlite3
import logging
from optparse import OptionParser
from urlparse import urlparse
#pip install requests
import requests
#################################################################
# FUNCTION process_row_to_db.
# handle one row and push to the DB
#
#################################################################
def process_row_to_db(conn, data_row, comment, hostname):
    insert_stmt = "INSERT OR IGNORE INTO adstxt (SITE_DOMAIN, EXCHANGE_DOMAIN, SELLER_ACCOUNT_ID, ACCOUNT_TYPE, TAG_ID, ENTRY_COMMENT) VALUES (?, ?, ?, ?, ?, ?);"
    exchange_host = ''
    seller_account_id = ''
    account_type = ''
    tag_id = ''

    if len(data_row) >= 3:
        exchange_host = data_row[0].lower()
        seller_account_id = data_row[1].lower()
        account_type = data_row[2].lower()

    if len(data_row) == 4:
        tag_id = data_row[3].lower()

    # data validation heuristics
    data_valid = 1

    # Minimum length of a domain name is 1 character, not including extensions.
    # Domain Name Rules - Nic AG
    # www.nic.ag/rules.htm
    if len(hostname) < 3:
        data_valid = 0

    if len(exchange_host) < 3:
        data_valid = 0

    # could be single digit integers
    if len(seller_account_id) < 1:
        data_valid = 0

    # ads.txt supports 'DIRECT' and 'RESELLER'
    if len(account_type) < 6:
        data_valid = 0

    if data_valid > 0:
        logging.debug("%s | %s | %s | %s | %s | %s" % (hostname, exchange_host, seller_account_id, account_type, tag_id, comment))

        # Insert a row of data using bind variables (protects against SQL injection)
        c = conn.cursor()
        c.execute(insert_stmt, (hostname, exchange_host, seller_account_id, account_type, tag_id, comment))

        # Save (commit) the changes
        conn.commit()
        return 1

    return 0
# end process_row_to_db #####
#################################################################
# FUNCTION crawl_to_db.
# crawl the URLs, parse the data, validate and dump to a DB
#
#################################################################
def crawl_to_db(conn, crawl_url_queue):
    rowcnt = 0

    myheaders = {
        'User-Agent': 'AdsTxtCrawler/1.0; +https://github.com/InteractiveAdvertisingBureau/adstxtcrawler',
        'Accept': 'text/plain',
    }

    for aurl in crawl_url_queue:
        ahost = crawl_url_queue[aurl]
        logging.info(" Crawling %s : %s " % (aurl, ahost))
        r = requests.get(aurl, headers=myheaders)
        logging.info(" %d" % r.status_code)

        if r.status_code == 200:
            logging.debug("-------------")
            logging.debug(r.request.headers)
            logging.debug("-------------")
            logging.debug("%s" % r.text)
            logging.debug("-------------")

            tmpfile = 'tmpads.txt'
            with open(tmpfile, 'wb') as tmp_csv_file:
                tmp_csv_file.write(r.text)

            with open(tmpfile, 'rb') as tmp_csv_file:
                # read the line, split on the first comment and keep what is to the left (if any found)
                line_reader = csv.reader(tmp_csv_file, delimiter='#', quotechar='|')
                comment = ''

                for line in line_reader:
                    logging.debug("DATA: %s" % line)

                    try:
                        data_line = line[0]
                    except IndexError:
                        data_line = ""

                    # determine the delimiter; be conservative and do it per row
                    if data_line.find(",") != -1:
                        data_delimiter = ','
                    elif data_line.find("\t") != -1:
                        data_delimiter = '\t'
                    else:
                        data_delimiter = ' '

                    # use the delimiter detected above (the original always passed ',')
                    data_reader = csv.reader([data_line], delimiter=data_delimiter, quotechar='|')
                    for row in data_reader:
                        if len(row) > 0 and row[0].startswith('#'):
                            continue

                        if (len(line) > 1) and (len(line[1]) > 0):
                            comment = line[1]

                        rowcnt = rowcnt + process_row_to_db(conn, row, comment, ahost)

    return rowcnt
# end crawl_to_db #####
#################################################################
# FUNCTION load_url_queue
# Load the target set of URLs and reduce to an ads.txt domains queue
#
#################################################################
def load_url_queue(csvfilename, url_queue):
    cnt = 0

    with open(csvfilename, 'rb') as csvfile:
        targets_reader = csv.reader(csvfile, delimiter=',', quotechar='|')

        for row in targets_reader:
            if len(row) < 1 or row[0].startswith('#'):
                continue

            for item in row:
                host = "localhost"

                if "http:" in item or "https:" in item:
                    logging.info("URL: %s" % item)
                    parsed_uri = urlparse(row[0])
                    host = parsed_uri.netloc
                else:
                    host = item
                    logging.info("HOST: %s" % item)

                skip = 0

                try:
                    #print "Checking DNS: %s" % host
                    ip = socket.gethostbyname(host)

                    if "127.0.0" in ip:
                        skip = 0  # swap to 1 to skip localhost testing
                    elif "0.0.0.0" in ip:
                        skip = 1
                    else:
                        logging.info(" Validated Host IP: %s" % ip)
                except:
                    skip = 1

                if skip < 1:
                    ads_txt_url = 'http://{thehost}/ads.txt'.format(thehost=host)
                    logging.info(" pushing %s" % ads_txt_url)
                    url_queue[ads_txt_url] = host
                    cnt = cnt + 1

    return cnt
# end load_url_queue #####
#### MAIN ####
arg_parser = OptionParser()
arg_parser.add_option("-t", "--targets", dest="target_filename",
                      help="list of domains to crawl ads.txt from", metavar="FILE")
arg_parser.add_option("-d", "--database", dest="target_database",
                      help="Database to dump crawled data into", metavar="FILE")
arg_parser.add_option("-v", "--verbose", dest="verbose", action='count',
                      help="Increase verbosity (specify multiple times for more)")
(options, args) = arg_parser.parse_args()

if len(sys.argv) == 1:
    arg_parser.print_help()
    exit(1)

log_level = logging.WARNING  # default
if options.verbose == 1:
    log_level = logging.INFO
elif options.verbose >= 2:
    log_level = logging.DEBUG

logging.basicConfig(filename='adstxt_crawler.log', level=log_level, format='%(asctime)s %(filename)s:%(lineno)d:%(levelname)s %(message)s')

crawl_url_queue = {}
conn = None
cnt_urls = 0
cnt_records = 0

cnt_urls = load_url_queue(options.target_filename, crawl_url_queue)

if (cnt_urls > 0) and options.target_database and (len(options.target_database) > 1):
    conn = sqlite3.connect(options.target_database)
    with conn:
        cnt_records = crawl_to_db(conn, crawl_url_queue)
        if cnt_records > 0:
            conn.commit()
    #conn.close()

print "Wrote %d records from %d URLs to %s" % (cnt_records, cnt_urls, options.target_database)
logging.warning("Wrote %d records from %d URLs to %s" % (cnt_records, cnt_urls, options.target_database))
logging.warning("Finished.")
I'm using Python 2.7.9.
I tried to install sqlite with this command:
python -m pip install sqlite
I got back this:
Downloading/unpacking sqlite3 Could not find any downloads that
satisfy the requirement sqlite3 Cleaning up... No distributions at all
found for sqlite3 Storing debug log for failure in ...\pip.log
First step would be this command:
$sqlite3 adstxt.db < adstxt_crawler.sql
I got these:
"'sqlite3' is not recognized as an internal or external command, operable program or batch file."
I know it's very basic, but I haven't found any relevant help. If you could help me, I'd really appreciate it.
Thanks.
Adam
The first error:
'sqlite3' is not recognized as an internal or external command, operable program or batch file.
is because you are trying to run the sqlite3 command line tool, which is not installed on your system. Python ships with the sqlite3 module in its standard library (since 2.5, so your 2.7.9 already has it and there is nothing to pip install), but it does not provide the standalone sqlite3 command line shell; that shell is a separate download from sqlite.org.
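That also means you can create the database from Python instead of the missing command line shell. A minimal sketch, assuming the schema file adstxt_crawler.sql from the crawler repo sits in the current directory:

import sqlite3

conn = sqlite3.connect('adstxt.db')   # creates the file if it doesn't exist
with open('adstxt_crawler.sql') as f:
    conn.executescript(f.read())      # run the CREATE TABLE statements
conn.commit()
conn.close()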
The second error is a syntax error. In Python 3, print is a regular function, so it must be called with parentheses:
print('hello world')
You probably tried to run Python 2 code with a Python 3 interpreter.
SpringerLink has changed its structure, and now the script doesn't work anymore. With it you could download all chapters of a book at once instead of downloading each chapter individually.
I installed the script and its dependencies on Linux,
from here http://milianw.de/code-snippets/take-2-download-script-for-springerlinkcom-ebooks and here https://github.com/milianw/springer_download
#! /usr/bin/env python
# -*- coding: utf-8 -*-

import os
import sys
import getopt
import urllib
import re
import tempfile
import shutil
import subprocess

# Set some kind of User-Agent so we don't get blocked by SpringerLink
class SpringerURLopener(urllib.FancyURLopener):
    version = "Mozilla 5.0"
def pdfcat(fileList, bookTitlePath):
    if findInPath("pdftk") != False:
        command = [findInPath("pdftk")]
        command.extend(fileList)
        command.extend(["cat", "output", bookTitlePath])
        subprocess.Popen(command, shell=False).wait()
    elif findInPath("stapler") != False:
        command = [findInPath("stapler"), "cat"]
        command.extend(fileList)
        command.append(bookTitlePath)
        subprocess.Popen(command, shell=False).wait()
    else:
        error("You have to install pdftk (http://www.accesspdf.com/pdftk/) or stapler (http://github.com/hellerbarde/stapler).")
# validate CLI arguments and start downloading
def main(argv):
    if not findInPath("iconv"):
        error("You have to install iconv.")

    # Test if convert is installed
    if os.system("convert --version > /dev/null 2>&1") != 0:
        error("You have to install the package ImageMagick in order to use convert")

    try:
        opts, args = getopt.getopt(argv, "hl:c:n", ["help", "link=", "content=", "no-merge"])
    except getopt.GetoptError:
        error("Could not parse command line arguments.")

    link = ""
    hash = ""
    merge = True

    for opt, arg in opts:
        if opt in ("-h", "--help"):
            usage()
            sys.exit()
        elif opt in ("-c", "--content"):
            if link != "":
                usage()
                error("-c and -l arguments are mutually exclusive")
            hash = arg
        elif opt in ("-l", "--link"):
            if hash != "":
                usage()
                error("-c and -l arguments are mutually exclusive")
            match = re.match("(https?://)?(www\.)?springer(link)?.(com|de)/(content|.*book)/(?P<hash>[a-z0-9\-]+)/?(\?[^/]*)?$", arg)
            if not match:
                usage()
                error("Bad link given. See example link.")
            hash = match.group("hash")
        elif opt in ("-n", "--no-merge"):
            merge = False

    if hash == "":
        usage()
        error("Either a link or a hash must be given.")

    if merge and not findInPath("pdftk") and not findInPath("stapler"):
        error("You have to install pdftk (http://www.accesspdf.com/pdftk/) or stapler (http://github.com/hellerbarde/stapler).")

    baseLink = "http://www.springerlink.com/content/" + hash + "/"
    link = baseLink + "contents/"
    chapters = list()
    loader = SpringerURLopener()
    curDir = os.getcwd()

    bookTitle = ""
    coverLink = ""
    front_matter = False

    while True:
        # download page source
        try:
            print "fetching book information...\n\t%s" % link
            page = loader.open(link, "MUD=MP").read()
        except IOError, e:
            error("Bad link given (%s)" % e)

        if re.search(r'403 Forbidden', page):
            error("Could not access page: 403 Forbidden error.")

        if bookTitle == "":
            match = re.search(r'<h1[^<]+class="title">(.+?)(?:<br/>\s*<span class="subtitle">(.+?)</span>\s*)?</h1>', page, re.S)
            if not match or match.group(1).strip() == "":
                error("Could not evaluate book title - bad link %s" % link)
            else:
                bookTitle = match.group(1).strip()

            # remove tags, e.g. <sub>
            bookTitle = re.sub(r'<[^>]*?>', '', bookTitle)

            # subtitle
            if match and match.group(2) and match.group(2).strip() != "":
                bookTitle += " - " + match.group(2).strip()

            # edition
            #match = re.search(r'<td class="labelName">Edition</td><td class="labelValue">([^<]+)</td>', page)
            #if match:
            #    bookTitle += " " + match.group(1).strip()

            ## year
            #match = re.search(r'<td class="labelName">Copyright</td><td class="labelValue">([^<]+)</td>', page)
            #if match:
            #    bookTitle += " " + match.group(1).strip()

            ## publisher
            #match = re.search(r'<td class="labelName">Publisher</td><td class="labelValue">([^<]+)</td>', page)
            #if match:
            #    bookTitle += " - " + match.group(1).strip()

            # coverimage
            match = re.search(r'<div class="coverImage" title="Cover Image" style="background-image: url\(/content/([^/]+)/cover-medium\.gif\)">', page)
            if match:
                coverLink = "http://www.springerlink.com/content/" + match.group(1) + "/cover-large.gif"

            bookTitlePath = curDir + "/%s.pdf" % sanitizeFilename(bookTitle)
            if bookTitlePath == "":
                error("could not transliterate book title %s" % bookTitle)

            if os.path.isfile(bookTitlePath):
                error("%s already downloaded" % bookTitlePath)

            print "\nNow Trying to download book '%s'\n" % bookTitle
            #error("foo")

        # get chapters
        for match in re.finditer('href="([^"]+\.pdf)"', page):
            chapterLink = match.group(1)
            if chapterLink[:7] == "http://":  # skip external links
                continue
            if re.search(r'front-matter.pdf', chapterLink):
                if front_matter:
                    continue
                else:
                    front_matter = True
            if re.search(r'back-matter.pdf', chapterLink) and re.search(r'<a href="([^"#]+)"[^>]*>Next</a>', page):
                continue
            # skip back-matter if it is in the list as second chapter - it will also be there at the end of the book
            if re.search(r'back-matter.pdf', chapterLink):
                if len(chapters) < 2:
                    continue
            chapters.append(chapterLink)

        # get next page
        match = re.search(r'<a href="([^"#]+)"[^>]*>Next</a>', page)
        if match:
            link = "http://www.springerlink.com" + match.group(1).replace("&amp;", "&")
        else:
            break

    if len(chapters) == 0:
        error("No chapters found - bad link?")

    print "found %d chapters" % len(chapters)

    # setup; set tempDir as working directory
    tempDir = tempfile.mkdtemp()
    os.chdir(tempDir)

    i = 1
    fileList = list()
    for chapterLink in chapters:
        if chapterLink[0] == "/":
            chapterLink = "http://www.springerlink.com" + chapterLink
        else:
            chapterLink = baseLink + chapterLink
        chapterLink = re.sub("/[^/]+/\.\.", "", chapterLink)
        print "downloading chapter %d/%d" % (i, len(chapters))
        localFile, mimeType = geturl(chapterLink, "%d.pdf" % i)
        if mimeType.gettype() != "application/pdf":
            os.chdir(curDir)
            shutil.rmtree(tempDir)
            error("downloaded chapter %s has invalid mime type %s - are you allowed to download %s?" % (chapterLink, mimeType.gettype(), bookTitle))
        fileList.append(localFile)
        i += 1

    if coverLink != "":
        print "downloading front cover from %s" % coverLink
        localFile, mimeType = geturl(coverLink, "frontcover")
        if os.system("convert %s %s.pdf" % (localFile, localFile)) == 0:
            fileList.insert(0, localFile + ".pdf")

    if merge:
        print "merging chapters"
        if len(fileList) == 1:
            shutil.move(fileList[0], bookTitlePath)
        else:
            pdfcat(fileList, bookTitlePath)

        # cleanup
        os.chdir(curDir)
        shutil.rmtree(tempDir)
        print "book %s was successfully downloaded, it was saved to %s" % (bookTitle, bookTitlePath)
        log("downloaded %s chapters (%.2fMiB) of %s\n" % (len(chapters), os.path.getsize(bookTitlePath) / 2.0**20, bookTitle))
    else:  # HL: if merge=False
        print "book %s was successfully downloaded, unmerged chapters can be found in %s" % (bookTitle, tempDir)
        log("downloaded %s chapters of %s\n" % (len(chapters), bookTitle))

    sys.exit()
# give a usage message
def usage():
    print """Usage:
%s [OPTIONS]

Options:
  -h, --help              Display this usage message
  -l LINK, --link=LINK    defines the link of the book you intend to download
  -c ISBN, --content=ISBN builds the link from a given ISBN (see below)
  -n, --no-merge          Only download the chapters but don't merge them into a single PDF.

You have to set exactly one of these options.

LINK:
  The link to the detail page of the ebook of your choice on SpringerLink.
  It lists book metadata and has a possibly paginated list of the chapters of the book.
  It has the form:
    http://www.springerlink.com/content/ISBN/STUFF
  Where: ISBN is a string consisting of lower-case latin chars and numbers.
         It alone identifies the book you intend to download.
         STUFF is optional and looks like #section=... or similar. It will be stripped.
""" % os.path.basename(sys.argv[0])
# raise an error and quit
def error(msg=""):
    if msg != "":
        log("ERR: " + msg + "\n")
        print "\nERROR: %s\n" % msg
    sys.exit(2)
    return None

# log to file
def log(msg=""):
    logFile = open('springer_download.log', 'a')
    logFile.write(msg)
    logFile.close()
# based on http://stackoverflow.com/questions/377017/test-if-executable-exists-in-python
def findInPath(prog):
    for path in os.environ["PATH"].split(os.pathsep):
        exe_file = os.path.join(path, prog)
        if os.path.exists(exe_file) and os.access(exe_file, os.X_OK):
            return exe_file
    return False
# based on http://mail.python.org/pipermail/python-list/2005-April/319818.html
def _reporthook(numblocks, blocksize, filesize, url=None):
    # XXX Should handle possible filesize=-1.
    try:
        percent = min((numblocks * blocksize * 100) / filesize, 100)
    except:
        percent = 100
    if numblocks != 0:
        sys.stdout.write("\b" * 70)
    sys.stdout.write("%-66s%3d%%" % (url, percent))
def geturl(url, dst):
    downloader = SpringerURLopener()
    if sys.stdout.isatty():
        response = downloader.retrieve(url, dst,
            lambda nb, bs, fs, url=url: _reporthook(nb, bs, fs, url), "MUD=MP")
        sys.stdout.write("\n")
    else:
        response = downloader.retrieve(url, dst, None, "MUD=MP")
    return response
def sanitizeFilename(filename):
    p1 = subprocess.Popen(["echo", filename], stdout=subprocess.PIPE)
    p2 = subprocess.Popen(["iconv", "-f", "UTF-8", "-t", "ASCII//TRANSLIT"], stdin=p1.stdout, stdout=subprocess.PIPE)
    return re.sub("\s+", "_", p2.communicate()[0].strip().replace("/", "-"))

# start program
if __name__ == "__main__":
    main(sys.argv[1:])

# kate: indent-width 4; replace-tabs on;
Expected: it should download the book.
Actual result: with the command ./springer_download.py -c "978-3-662-54804-2" I get ERROR: Could not evaluate book title - bad link http://www.springerlink.com/content/978-3-662-54804-2/contents/
the test
python2 ./springer_download.py -c "978-3-662-54804-2"
does not work either
In the code above, the error occurs in this context:
match = re.search(r'<h2 class="MPReader_Profiles_SpringerLink_Content_PrimitiveHeadingControlName">([^<]+)</h2>', page)
if not match or match.group(1).strip() == "":
    error("Could not evaluate book title - bad link?")
else:
    bookTitle = match.group(1).strip()
    print "\nThe book you are trying to download is called '%s'\n" % bookTitle
I would also be happy with alternatives like browser add-ons or the like. As an example, take https://link.springer.com/book/10.1007/978-3-662-54805-9#toc
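One possible alternative to fixing the script (a sketch only, not a tested fix): on the current link.springer.com site, the "Download book PDF" button points at a single URL of the form https://link.springer.com/content/pdf/<DOI>.pdf, so for books you are entitled to access (open access, or via your institution), something like this may already be enough:

# minimal sketch, Python 2; assumes the book is accessible to you and that
# the /content/pdf/<DOI>.pdf pattern used by the download button still holds
import urllib

doi = "10.1007/978-3-662-54805-9"  # taken from the link.springer.com URL above
url = "https://link.springer.com/content/pdf/%s.pdf" % doi
print "downloading %s" % url
urllib.urlretrieve(url, doi.replace("/", "_") + ".pdf")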
The following script is an extract from
https://github.com/RittmanMead/obi-metrics-agent/blob/master/obi-metrics-agent.py
The script is written in Jython, and it hits the WebLogic admin console to extract metrics.
The problem is that it runs only once and does not loop infinitely.
Here's the script that I've extracted from the original for my purpose:
import calendar, time
import sys
import getopt

print '---------------------------------------'

# Check the arguments to this script are as expected.
# argv[0] is script name.
argLen = len(sys.argv)
if argLen - 1 < 2:
    print "ERROR: got ", argLen - 1, " args, must be at least two."
    print '$FMW_HOME/oracle_common/common/bin/wlst.sh obi-metrics-agent.py <AdminUserName> <AdminPassword> [<AdminServer_t3_url>] [<Carbon|InfluxDB>] [<target host>] [<target port>] [targetDB influx db>'
    exit()

outputFormat = 'CSV'
url = 't3://localhost:7001'
targetHost = 'localhost'
targetDB = 'obi'
targetPort = '8086'

try:
    wls_user = sys.argv[1]
    wls_pw = sys.argv[2]
    url = sys.argv[3]
    outputFormat = sys.argv[4]
    targetHost = sys.argv[5]
    targetPort = sys.argv[6]
    targetDB = sys.argv[7]
except:
    print ''

print wls_user, wls_pw, url, outputFormat, targetHost, targetPort, targetDB

now_epoch = calendar.timegm(time.gmtime()) * 1000

if outputFormat == 'InfluxDB':
    import httplib
    influx_msgs = ''

connect(wls_user, wls_pw, url)
results = displayMetricTables('Oracle_BI*', 'dms_cProcessInfo')

while True:
    for table in results:
        tableName = table.get('Table')
        rows = table.get('Rows')
        rowCollection = rows.values()
        iter = rowCollection.iterator()
        while iter.hasNext():
            row = iter.next()
            rowType = row.getCompositeType()
            keys = rowType.keySet()
            keyIter = keys.iterator()
            inst_name = row.get('Name').replace(' ', '-')
            try:
                server = row.get('Servername').replace(' ', '-').replace('/', '_')
            except:
                try:
                    server = row.get('ServerName').replace(' ', '-').replace('/', '_')
                except:
                    server = 'unknown'
            try:
                host = row.get('Host').replace(' ', '-')
            except:
                host = ''
            while keyIter.hasNext():
                columnName = keyIter.next()
                value = row.get(columnName)
                if columnName.find('.value') > 0:
                    metric_name = columnName.replace('.value', '')
                    if value is not None:
                        if outputFormat == 'InfluxDB':
                            influx_msg = ('%s,server=%s,host=%s,metric_group=%s,metric_instance=%s value=%s %s') % (metric_name, server, host, tableName, inst_name, value, now_epoch * 1000000)
                            influx_msgs += '\n%s' % influx_msg
                            conn = httplib.HTTPConnection('%s:%s' % (targetHost, targetPort))
                            ## TODO pretty sure should be urlencoding this ...
                            a = conn.request("POST", ("/write?db=%s" % targetDB), influx_msg)
                            r = conn.getresponse()
                            if r.status != 204:
                                print 'Failed to send to InfluxDB! Error %s Reason %s' % (r.status, r.reason)
                                print influx_msg
                                #sys.exit(2)
                    else:
                        print 'Skipping None value %s,server=%s,host=%s,metric_group=%s,metric_instance=%s value=%s %s' % (metric_name, server, host, tableName, inst_name, value, now_epoch * 1000000)
I've tried using a while loop, but that just stopped the code from exiting; it didn't actually re-loop.
What I want to achieve is to loop infinitely after connecting to WebLogic,
i.e. after this line
connect(wls_user,wls_pw,url)
and perhaps sleep for 5 seconds before re-running
Any and all help will be appreciated
Thanks
P
You can use this kind of condition for the loop:

mainLoop = 'true'
while mainLoop == 'true':

and this for the pause between iterations:

java.lang.Thread.sleep(3 * 1000)
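Putting it together, a sketch of how the extract could be restructured (assuming displayMetricTables can simply be called again on each pass; time.sleep also works in Jython, so either sleep call will do for the 5 second pause):

connect(wls_user, wls_pw, url)

while True:
    # re-read the metrics on every pass; in the extract above, results was
    # computed once before the loop, so every iteration saw the same data
    results = displayMetricTables('Oracle_BI*', 'dms_cProcessInfo')
    now_epoch = calendar.timegm(time.gmtime()) * 1000  # refresh the timestamp too
    for table in results:
        process_table(table)  # hypothetical helper wrapping the row/column handling shown above
    time.sleep(5)  # wait 5 seconds before polling again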
I have some code which needs to pass the latency, upspeed, and dlspeed to another web site to display. Right now the code is as below:
import datetime
import os
import sys
import shutil
import webbrowser
import tempfile
import subprocess
import json
import urllib.request
import statistics
import pymysql
import pymysql.cursors
IPERF3_WIN_PATH = "data/iperf3.exe"
HTML_TEMPLATE_PATH = "data/template.html"
IPERF3_HOST = "127.0.0.1"
RESULT_UPLOAD_URL = "UPLOAD URL"
RESULT_VIEW_URL = "VIEW URL"
def resource_path(relative_path):
    """ Get absolute path to resource, works for dev and for PyInstaller
    This is to get a path which will work with pyinstaller
    """
    try:
        # PyInstaller creates a temp folder and stores path in
        # _MEIPASS
        base_path = sys._MEIPASS
    except Exception:
        base_path = os.path.abspath(".")
    return os.path.join(base_path, relative_path)
def ping(ip, tries):
    """ Ping "ip" using the Windows ping command
    Return the average ping as an int
    """
    res = 0
    try:
        output = subprocess.check_output(
            ["ping", "-n", str(tries), ip]).decode("utf-8")
        res = int(output.split(" = ")[-1].split("ms")[0])
    except subprocess.CalledProcessError:
        input("Press Enter to Continue...")
        sys.exit("Error while trying to ping the server, exiting")
    else:
        return res
def copyIperf3Exec():
    """ On OSX:
    Copy the iperf3 binary to a tmp file,
    make it executable and return its path
    This is to avoid many bundle related problems
    On Windows, just return the package path """
    return resource_path(IPERF3_WIN_PATH)
def get_iperf3_download():
    """ Return the output of the iperf3 cli as a python dict """
    ipf3_tmp = copyIperf3Exec()
    try:
        output = subprocess.check_output([ipf3_tmp,
                                          "-c", IPERF3_HOST,
                                          "-J",
                                          "-P", "16",
                                          "-w", "710000",
                                          "-R"])
        res_string = output.decode("utf-8")
    except subprocess.CalledProcessError:
        input("Press Enter to Continue...")
        sys.exit("Problem while doing the test, please try again later")
    else:
        return json.loads(res_string)
def get_iperf3_upload():
    """ Return the output of the iperf3 cli as a python dict """
    ipf3_tmp = copyIperf3Exec()
    try:
        output = subprocess.check_output([ipf3_tmp,
                                          "-c", IPERF3_HOST,
                                          "-J",
                                          "-P", "10",
                                          "-w", "710000"])
        res_string = output.decode("utf-8")
    except subprocess.CalledProcessError:
        input("Press Enter to Continue...")
        sys.exit("Error while doing the upload test, please try again later")
    else:
        return json.loads(res_string)
def get_userinfos():
    """ Get the 3 pieces of information to be presented to the user
    (ip, upload speed, download speed)
    Return a dictionary
    """
    show_start_msg(0)  # 0% Progress bar
    avg_latency = ping(IPERF3_HOST, 5)
    u_json = get_iperf3_upload()
    show_start_msg(1)  # 40%
    d_json = get_iperf3_download()
    show_start_msg(2)  # 80%
    ip = getip_apify()

    u_bits_per_second = u_json['end']['sum_received']['bits_per_second']
    d_bits_per_second = d_json['end']['sum_received']['bits_per_second']
    u_testtime = u_json['end']['sum_received']['seconds']
    d_testtime = d_json['end']['sum_received']['seconds']
    u_testdate = u_json["start"]["timestamp"]["timesecs"]
    d_testdate = d_json["start"]["timestamp"]["timesecs"]

    res = {
        'ip': ip,
        'latency': avg_latency,
        'upspeed': u_bits_per_second,
        'dlspeed': d_bits_per_second,
        'upspeedtime': u_testtime,
        'dlspeedtime': d_testtime,
        'upspeeddate': u_testdate,
        'dlspeeddate': d_testdate
    }
    return res
def sendToDB(infos):
    # Connect to the database
    connection = pymysql.connect(host='127.0.0.1',
                                 user='testclient',
                                 password='password',
                                 db='speed',
                                 charset='utf8mb4',
                                 cursorclass=pymysql.cursors.DictCursor)
    try:
        with connection.cursor() as cursor:
            # Create a new record
            def stp_date(stp):
                return datetime.datetime.fromtimestamp(stp).strftime(
                    '%Y-%m-%d %H:%M:%S')

            sql = ("INSERT INTO `speedlog`"
                   "(`externalIP`, `uploadspeed`, `uploadspeedtime`,"
                   "`uploadspeeddate`, `downloadspeed`, `downloadspeedtime`,"
                   "`downloadspeeddate`, `latency`)"
                   "VALUES (%s, %s, %s, %s, %s, %s, %s, %s)")
            cursor.execute(sql,
                           (infos["ip"],
                            str(int(infos["upspeed"])),
                            str("{0:.2f}".format(infos["upspeedtime"])),
                            stp_date(infos["upspeeddate"]),
                            str(int(infos["dlspeed"])),
                            str("{0:.2f}".format(infos["dlspeedtime"])),
                            stp_date(infos["dlspeeddate"]),
                            str(int(infos["latency"]))))

        # connection is not autocommit by default,
        # so you must commit to save your changes.
        connection.commit()
    finally:
        connection.close()
    return
def getip_apify():
    res = urllib.request.urlopen("http://api.ipify.org")
    raw_ip = res.read()
    return raw_ip.decode('utf-8')
def prepare_template(templatePath, infos):
    """ Load an html located at templatePath and replace the necessary text
    with the associated values from the iPerf3 infos
    Return a string
    """
    f_template = open(templatePath)
    s_template = f_template.read()
    f_template.close()
    mod_template = s_template.replace("avglatency", str(int(infos['latency'])))
    mod_template = mod_template.replace(
        "upspeed", str("{0:.3f}".format(infos['upspeed'] / (1000 * 1000 * 1000))))
    mod_template = mod_template.replace(
        "dlspeed", str("{0:.3f}".format(infos['dlspeed'] / (1000 * 1000 * 1000))))
    return mod_template
def str_to_tempHtml(str):
    """ Write "str" to a temporary .html file
    and return its path
    """
    data = bytes(str, "utf-8")
    tmp = tempfile.NamedTemporaryFile(suffix=".html", delete=False)
    tmp.write(data)
    tmp.flush()
    return tmp.name
def show_start_msg(progress):
    if sys.platform.startswith('darwin'):
        unused = os.system('clear')
    elif sys.platform.startswith('win32'):
        unused = os.system('cls')
    print("=" * 70)
    print("Speed Testing for 10G Network \n")
    print("Powered by iPerf3")
    print("=" * 70)

    if progress == -1:
        input("Press Enter to Continue...\n")
        return
    else:
        print("Press Enter to Continue...\n")
        print("Testing in progress")
        if progress == 0:
            print("[" + " " * 68 + "]" + " 0%")
        elif progress == 1:
            print("[" + "#" * 27 + " " * 41 + "]" + " 40%")
        elif progress == 2:
            print("[" + "#" * 54 + " " * 14 + "]" + " 80%")
        elif progress == 3:
            print("[" + "#" * 68 + "]" + " 100%")
            print("Completed")
if __name__ == '__main__':
    show_start_msg(-1)
    infos = get_userinfos()
    sendToDB(infos)
    show_start_msg(3)  # 100% Complete

    data = {"key": "Jasdkjfhsda349*lio34sdfFdslaPisdf",
            "download": "2048000",
            "upload": "2048000",
            "latency": "10"}

    req = urllib.request.Request(RESULT_UPLOAD_URL, json.dumps(data).encode(
        'ascii'))
    req.add_header('Content-Type', 'application/json')
    resp = urllib.request.urlopen(req).read().decode('ascii')
    resp = resp.replace('\'', '"')
    webbrowser.open(RESULT_VIEW_URL.format(json.loads(resp)['test_id']))
    input("Press Enter to Continue...")
My latency, upspeed and dlspeed variables are stored in infos, and later sent over to the DB for recording via sendToDB(infos).
The next part is to also pass these variables to another web site via REST. In the data dict, the first attribute "key" is the REST key for authentication, followed by the rest of the values like latency, download speed and upload speed. However, you can see that in data all three variables are hard-coded values instead of the values derived from the test, which are latency, upspeed and dlspeed.
How can I modify the code to use these attributes instead of the hardcoded ones?
You have a method that returns this dictionary...

res = {
    'ip': ip,
    'latency': avg_latency,
    'upspeed': u_bits_per_second,
    'dlspeed': d_bits_per_second,
    'upspeedtime': u_testtime,
    'dlspeedtime': d_testtime,
    'upspeeddate': u_testdate,
    'dlspeeddate': d_testdate
}

And it is called infos, so use it:

data = {"key": "xxxxxxxx",
        "download": infos['dlspeed'],
        "upload": infos['upspeed'],
        "latency": infos['latency']}
So the curl command I'm using is as follows:
cmd = "curl --write-out %{http_code} -X PUT -T " + self.basedir + putfile + " -# -o /dev/null " + self.uri + "/" + self.dist + "/" + putfile
I'd like to change this from invoking a system command to using pycurl. This way I can have more granular control over it and ultimately implement a progress bar for it. However, when I try to convert it to Python, my resulting script fails. Here is my attempt at a Python script:
f = open(filepath, "rb")
fs = os.path.getsize(filepath)

c = pycurl.Curl()
c.setopt(c.URL, target_url)
c.setopt(c.HTTPHEADER, ["User-Agent: Load Tool (PyCURL Load Tool)"])
c.setopt(c.PUT, 1)
c.setopt(c.READDATA, f)
c.setopt(c.INFILESIZE, int(fs))
c.setopt(c.NOSIGNAL, 1)
c.setopt(c.VERBOSE, 1)
c.body = StringIO()
c.setopt(c.WRITEFUNCTION, c.body.write)

try:
    c.perform()
except:
    import traceback
    traceback.print_exc(file=sys.stderr)
    sys.stderr.flush()

f.close()
c.close()
sys.stdout.write(".")
sys.stdout.flush()
Here's what that outputs:
* About to connect() to ************ port 8090 (#0)
* Trying 16.94.124.53... * connected
> PUT /incoming/ HTTP/1.1
Host: ***********
Accept: */*
User-Agent: Load Tool (PyCURL Load Tool)
Content-Length: 21
Expect: 100-continue
< HTTP/1.1 100 Continue
* We are completely uploaded and fine
< HTTP/1.1 500 Internal Server Error
< Content-type: text/html
* no chunk, no close, no size. Assume close to signal end
<
Thanks in advance for you help!
I've made a working upload module; you can find your answers by looking through the code below.
You can find almost all answers regarding pycurl by digging through the libcurl examples and docs.
'''
Created on Oct 22, 2013

@author: me
'''
import pycurl
import os
import wx
import sys
import hashlib
from cStringIO import StringIO

def get_file_hash(full_filename):
    BLOCKSIZE = 65536
    hasher = hashlib.md5()
    with open(full_filename, 'rb') as afile:
        buf = afile.read(BLOCKSIZE)
        while len(buf) > 0:
            hasher.update(buf)
            buf = afile.read(BLOCKSIZE)
    return hasher.hexdigest()
class FtpUpload(object):
    def __init__(self, server, username, password, **items):
        self.server = server
        self.username = username
        self.password = password
        self.gauge = items.get("gauge")
        self.sb_speed = items.get("sb_speed")
        self.upload_file_size = items.get("upload_file_size")
        self.upload_file_speed = items.get("upload_file_speed")
        self.filesize = 0
        self.ftp_filehash = '0'

    def sizeToNiceString(self, byteCount):
        for (cutoff, label) in [(1024*1024*1024, "GB"), (1024*1024, "MB"), (1024, "KB")]:
            if byteCount >= cutoff:
                return "%.2f %s" % (byteCount * 1.0 / cutoff, label)
        if byteCount == 1:
            return "1 byte"
        else:
            return "%d bytes" % byteCount
    def initRange(self, filesize):
        self.filesize = filesize
        self.gauge.SetRange(filesize)

    def updateValue(self, upload_d):
        upload_d_int = int(upload_d)
        self.gauge.SetValue(upload_d_int)
        upload_d_str = self.sizeToNiceString(upload_d)
        upload_percent = int((upload_d * 100) / self.filesize)
        upload_d_status = "{0}/{1} ({2}%)".format(upload_d_str, self.sizeToNiceString(self.filesize), upload_percent)
        self.sb_speed.SetStatusText(upload_d_status, 1)
        self.upload_file_size.SetLabel(upload_d_status)
        self.upload_file_speed.SetLabel(upload_d_str)

    def progress(self, download_t, download_d, upload_t, upload_d):
        self.updateValue(upload_d)

    def test(self, debug_type, debug_msg):
        if len(debug_msg) < 300:
            print "debug(%d): %s" % (debug_type, debug_msg.strip())
    def ftp_file_hash(self, buf):
        sys.stderr.write("{0:.<20} : {1}\n".format('FTP RAW ', buf.strip()))
        ftp_filehash = dict()
        item = buf.strip().split('\n')[0]
        ext = item.split('.')[1]
        if len(ext) > 3:
            ftp_filename = item[:-33]
            ftp_filehash = item[-32:]
            self.ftp_filehash = ftp_filehash

    def get_ftp_file_hash(self, filename):
        c = pycurl.Curl()
        list_file_hash = 'LIST -1 ' + filename + "_*"
        sys.stderr.write("{0:.<20} : {1} \n".format('FTP command ', list_file_hash))
        c.setopt(pycurl.URL, self.server)
        c.setopt(pycurl.USERNAME, self.username)
        c.setopt(pycurl.PASSWORD, self.password)
        c.setopt(pycurl.VERBOSE, False)
        c.setopt(pycurl.DEBUGFUNCTION, self.test)
        c.setopt(pycurl.CUSTOMREQUEST, list_file_hash)
        c.setopt(pycurl.WRITEFUNCTION, self.ftp_file_hash)
        c.perform()
        c.close()
    def delete_ftp_hash_file(self, ftp_hash_file_old):
        c = pycurl.Curl()
        delete_hash_file = 'DELE ' + ftp_hash_file_old
        sys.stderr.write("{0:.<20} : {1} \n".format('FTP command ', delete_hash_file))
        c.setopt(pycurl.URL, self.server)
        c.setopt(pycurl.USERNAME, self.username)
        c.setopt(pycurl.PASSWORD, self.password)
        c.setopt(pycurl.VERBOSE, False)
        c.setopt(pycurl.DEBUGFUNCTION, self.test)
        c.setopt(pycurl.CUSTOMREQUEST, delete_hash_file)
        try:
            c.perform()
        except Exception as e:
            print e
        c.close()
    def upload(self, full_filename, filesize):
        self.initRange(filesize)
        filename = os.path.basename(full_filename)
        sys.stderr.write("filename: %s\n" % full_filename)
        c = pycurl.Curl()
        c.setopt(pycurl.USERNAME, self.username)
        c.setopt(pycurl.PASSWORD, self.password)
        c.setopt(pycurl.VERBOSE, False)
        c.setopt(pycurl.DEBUGFUNCTION, self.test)
        c.setopt(pycurl.NOBODY, True)
        c.setopt(pycurl.HEADER, False)

        ftp_file_path = os.path.join(self.server, os.path.basename(full_filename))
        file_hash = get_file_hash(full_filename)
        ftp_hash_file = ftp_file_path + "_%s" % file_hash

        # Getting filesize if it exists on the server.
        try:
            c.setopt(pycurl.URL, ftp_file_path)
            c.perform()
            filesize_offset = int(c.getinfo(pycurl.CONTENT_LENGTH_DOWNLOAD))
        except Exception as error_msg:
            print error_msg
            wx.MessageBox(str(error_msg), 'Connection error!',
                          wx.OK | wx.ICON_ERROR)
            # Exit upload function.
            return True

        ftp_file_append = True

        # Get ftp file hash.
        self.get_ftp_file_hash(filename)

        offset = filesize_offset == -1 and '0' or filesize_offset
        sys.stderr.write("L_file hash : {0:.<60}: {1:<40}\n".format(filename, file_hash))
        sys.stderr.write("F_file hash : {0:.<60}: {1:<40}\n".format(filename, self.ftp_filehash))
        sys.stderr.write("{0:15} : {1:.>15}\n".format('filesize:', filesize))
        sys.stderr.write("{0:15} : {1:.>15}\n".format('ftp_filesize', offset))
        sys.stderr.write("{0:15} : {1:.>15}\n".format('to upload:', filesize - int(offset)))

        # File does not exist on FTP server.
        if filesize_offset == -1:
            # file not exist: uploading from offset zero.
            ftp_file_append = False
            filesize_offset = 0
        # Local and FTP file size and files MD5 are the same.
        elif filesize_offset == self.filesize and file_hash == self.ftp_filehash:
            sys.stderr.write("--- File exists on server! ---\n\n")
            self.upload_file_speed.SetLabel("File exists on server!")
            self.sb_speed.SetStatusText("File exists on server!", 1)
            # Check next filename.
            return False
        # Ftp file and local file have different data.
        elif file_hash != self.ftp_filehash:
            ftp_file_append = False
            filesize_offset = 0
            ftp_hash_file_old = filename + "_" + self.ftp_filehash
            # delete old hash file.
            self.delete_ftp_hash_file(ftp_hash_file_old)

        c.setopt(pycurl.FTPAPPEND, ftp_file_append)
        c.setopt(pycurl.UPLOAD, True)
        c.setopt(pycurl.PROGRESSFUNCTION, self.progress)

        with open('filehash.txt', 'w') as f:
            f.write(file_hash)

        for item in ("filehash.txt", full_filename):
            # don't show progress by default.
            noprogress = True
            # upload ftp_hash_file first.
            ftp_url = ftp_hash_file
            with open(item, "rb") as f:
                # changes ftp_url and shows progress values, adds filesize_offset.
                if item != "filehash.txt":
                    f.seek(filesize_offset)
                    noprogress = False
                    ftp_url = ftp_file_path
                c.setopt(pycurl.URL, ftp_url)
                c.setopt(pycurl.NOPROGRESS, noprogress)
                c.setopt(pycurl.READFUNCTION, f.read)
                try:
                    c.perform()
                    if item != "filehash.txt":
                        sys.stderr.write("{0:15} : {1:.>15}\n\n".format("size uploaded", int(c.getinfo(pycurl.SIZE_UPLOAD))))
                except Exception as error_msg:
                    print error_msg
                    wx.MessageBox(str(error_msg), 'Connection error!',
                                  wx.OK | wx.ICON_ERROR)
                    # Exit upload function.
                    return True

        self.ftp_filehash = '0'
        c.close()
if __name__ == '__main__':
    pass
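If all you need is the original HTTP PUT with a progress readout (rather than FTP resume), a much smaller sketch is enough. Here filepath and target_url are hypothetical placeholders, and note that the URL has to name the destination file; the trace above shows "PUT /incoming/" with no file name, which suggests the cause of the 500:

import os
import sys
import pycurl

def put_with_progress(filepath, target_url):
    # progress callback: pycurl passes (download_total, downloaded,
    # upload_total, uploaded) byte counts
    def on_progress(dl_total, dl_done, ul_total, ul_done):
        if ul_total:
            sys.stdout.write("\r%3d%%" % (ul_done * 100 / ul_total))
            sys.stdout.flush()

    fs = os.path.getsize(filepath)
    c = pycurl.Curl()
    with open(filepath, "rb") as f:
        # mirror "curl -T file url/dist/file": the destination file name is part of the URL
        c.setopt(c.URL, target_url + "/" + os.path.basename(filepath))
        c.setopt(c.UPLOAD, 1)
        c.setopt(c.READFUNCTION, f.read)
        c.setopt(c.INFILESIZE, fs)
        c.setopt(c.NOPROGRESS, 0)
        c.setopt(c.PROGRESSFUNCTION, on_progress)
        c.perform()
        status = c.getinfo(c.RESPONSE_CODE)
    c.close()
    return status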
I need to get the first 10 Google results, for example:

query = urllib.urlencode({'q': 'example'})
url = 'http://ajax.googleapis.com/ajax/services/search/web?v=1.0&%s' % (query)
search_results = urllib.urlopen(url)
json = simplejson.loads(search_results.read())
results = json['responseData']['results']

This will give me the results of the first page, but I'd like to get more Google results. Does anyone know how to do that?
I've done it in the past; here is a complete example (I'm not a Python guru, but it works):
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import sys, getopt
import urllib
import simplejson

OPTIONS = ("m:", ["min="])

def print_usage():
    s = "usage: " + sys.argv[0] + " "
    for o in OPTIONS[0]:
        if o != ":":
            s += "[-" + o + "] "
    print(s + "query_string\n")

def search(query, index, offset, min_count, quiet=False, rs=[]):
    url = "http://ajax.googleapis.com/ajax/services/search/web?v=1.0&rsz=large&%s&start=%s" % (query, offset)
    result = urllib.urlopen(url)
    json = simplejson.loads(result.read())
    status = json["responseStatus"]
    if status == 200:
        results = json["responseData"]["results"]
        cursor = json["responseData"]["cursor"]
        pages = cursor["pages"]
        for r in results:
            i = results.index(r) + (index - 1) * len(results) + 1
            u = r["unescapedUrl"]
            rs.append(u)
            if not quiet:
                print("%3d. %s" % (i, u))
        next_index = None
        next_offset = None
        for p in pages:
            if p["label"] == index:
                i = pages.index(p)
                if i < len(pages) - 1:
                    next_index = pages[i+1]["label"]
                    next_offset = pages[i+1]["start"]
                break
        if next_index != None and next_offset != None:
            if int(next_offset) < min_count:
                search(query, next_index, next_offset, min_count, quiet, rs)
    return rs

def main():
    min_count = 64
    try:
        opts, args = getopt.getopt(sys.argv[1:], *OPTIONS)
        for opt, arg in opts:
            if opt in ("-m", "--min"):
                min_count = int(arg)
        assert len(args) > 0
    except:
        print_usage()
        sys.exit(1)
    qs = " ".join(args)
    query = urllib.urlencode({"q": qs})
    search(query, 1, "0", min_count)

if __name__ == "__main__":
    main()
Edit: I've fixed the obvious command-line options mishandling; you can call this script as follows:
python gsearch.py --min=5 vanessa mae
The --min switch means "at least 5 results" and is optional; you will get the maximum allowed result count (64) if it is not specified.
Also, error handling is omitted for brevity.
See the docs: http://code.google.com/apis/websearch/docs/reference.html#_intro_fonje
You’re looking for the start parameter.
There’s no parameter to get more results in one response, but you can iterate via the start parameter.
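A minimal sketch of that iteration against the same v1.0 AJAX endpoint used above (with rsz=large each response carries up to 8 results, so start advances in steps of 8):

# sketch: page through results with the start parameter
import urllib
import simplejson

query = urllib.urlencode({'q': 'example'})
urls = []
for start in range(0, 32, 8):  # first four pages
    url = ('http://ajax.googleapis.com/ajax/services/search/web'
           '?v=1.0&rsz=large&start=%d&%s' % (start, query))
    data = simplejson.loads(urllib.urlopen(url).read())
    if data['responseStatus'] != 200:
        break
    urls.extend(r['unescapedUrl'] for r in data['responseData']['results'])
print urls[:10]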