Related
Edited
Q)Can someone help getting the values inserted into mysql database , just confused where place mydb function
Reason :Once I manually enter cntrl+c for .py , then only the values are getting inserted into mysql database
Used in the .py file
here is the complete code , where should i place the mydb function?
Table values not getting inserted into mysql database until cntrl+c is entered to close python file in linux
import os
import re
from builtins import len, Exception
import slack
import logging
from subprocess import check_output
import datetime
import mysql.connector
import time
import json
import requests
#user_threads_info = {}
#thread_ts = ""
#slack.RTMClient.run_on(event='message')
def say_hello(**payload):
try:
##0 get clients and payload
logging.info('msg received')
data = payload['data']
web_client = payload['web_client']
rtm_client = payload['rtm_client']
##0 - 1 Check if it is the first msg, not replied msg by me
# print(data)
if data.get('text') == None:
logging.info('This msg is my replied msg.')
return False
##0-2 Get channel info
channel_id = data['channel']
thread_ts = data['ts']
global user
user = data['user']
#user_info = get_userinfo(user)
#print(user_info)
msg = data['text']
##1 get scenario submsg
retVal = analysis_msg(msg)
# print(retVal)
response = web_client.users_list()
assert(response['ok'])
user_map = {x['id']: x['name'] for x in response['members']}
global user_name
user_name = user_map[user] if user in user_map else None
print(user_name)
if retVal[0] == False:
retMsg = retVal[1] + "\nI can create the following orders. \n" \
"a) spu - store pickup \n" \
"b) sth - ship to home \n" \
"c) del - delivery \n" \
"d) digitalAsGuest - Digital item \n" \
" \n" \
"Please provide information as mentioned in below example.\n" \
" \n" \
"Example: spu:3646989:sftqa3:AMEX\n" \
"\n" \
"Sample SKUS:\n" \
"spu - [3646989,8862011]\n" \
"sth - [2592015,6140094]\n" \
"del - [5592005,8862011]\n" \
"digitalAsGuest - [2810037,5057400]"
send_msg(web_client, channel_id, thread_ts, user, retMsg)
return False
##2 form cmd
retVal = form_cmd(retVal[1])
print(retVal)
if retVal == False:
return False
##3 execute cmd
# inform the start of test
retMsg = "Creating an order,Please wait for the result."
send_msg(web_client, channel_id, thread_ts, user, retMsg)
global res
try:
res1 = os.popen(retVal).read()
print("Printing result...")
print(res1)
print("end of print")
res = reg_result_new(res1)
if res == False:
print("reg_function failure")
retMsg = "The test order placement failed."
else:
retMsg = "Order Id - " + res['id'] + "\nFirst Name - " + res['firstName'] + "\nLast Name - " + res['lastName'] + "\n PhoneNumber - " + res['dayPhoneNumber'] + "\n Email - " + res['email'] + "\n"
except Exception as ee:
retMsg = "The test scenario has a failure. Please Check the feature file."
## 4 send result to slack
# retMsg = "Order Id - " + res['id'] + "\nFirst Name - " + res['firstName'] + "\nLast Name - " + res['lastName'] + "\n PhoneNumber - " + res['day PhoneNumber'] + "\n Email - " + res['email'] + "\n"
create_result_file(user, res)
send_msg(web_client, channel_id, thread_ts, user, retMsg)
print(retVal)
except Exception as e:
print("error")
logging.critical(str(e))
############################ My handlers ##############################
def create_result_file(user, res):
try:
cur_time = datetime.datetime.now()
file_name = user + str(cur_time.year) + str(cur_time.month) + str(cur_time.day) + str(cur_time.hour) + str(
cur_time.minute) + str(cur_time.second) + '.txt'
file = open(file_name, 'w')
file.write(res)
file.close()
except Exception as e:
print(str(e))
def send_msg(web_client, channel_id, thread_ts,user,mgs):
print("thread_ts value is:"+thread_ts)
web_client.chat_postMessage(
channel=channel_id,
text=f"```Hi <#{user}>! \n " + mgs + "```",
thread_ts=thread_ts
)
#def get_userinfo(user):
# payload = {'token': slack_token, 'user': user}
# r = requests.get('https://slack.com/api/users.info', params=payload)
# print(r.text)
# return json.loads(r.text)["user"]
# error code mgmt.
def error_code(code):
# reserved
print(code)
return [False, code]
# break down msg to the test scenario submsgs
def analysis_msg(msg):
global submsg
submsg = msg.split(":")
for value in submsg:
print(value)
if len(submsg) != 4:
logging.warning("This msg not test scenario")
return error_code("Please check the format")
res = {}
res["feature"] = submsg[0]
res["sku"] = submsg[1]
res["env"] = submsg[2]
res["payment"] = submsg[3]
###check
if validate_sku(res["sku"]) == False:
return error_code("INVALID_SKU \n")
if validate_env(res["env"]) == False:
return error_code("INVALID_ENV \n")
if validate_payment(res["payment"]) == False:
return error_code("INVALID_payment \n")
if check_specialCharacter(res["feature"]) == False:
return error_code("INVALID_PROFILE_WITH_SPECIAL_CHARACTER")
return [True, res]
# form cmd for test bat files ! reserved
def form_cmd(submsg):
cmd = 'sh /home/iptbot/iptautobot/test.sh ' + submsg['env'] + ' ' + submsg['feature'] + ' ' + submsg["sku"] + ' ' + submsg["payment"]
return cmd
#code to print user details
#code to print user details
def reg_result_new(res):
start = 'COP Order Response :'
end = 'isGuestMode'
start_index = res.find(start) + len(start)
res = res[start_index:]
end_index = res.find(end) + 22
global data
data = res[:end_index]
try:
print('Data -> ' + str(data))
data = json.loads(data.strip())
new_data = {}
new_data['id'] = data['id']
new_data['firstName'] = data['lineItems'][0]['fulfillmentInfo']['storeInfo']['agentInfo']['firstName']
new_data['lastName'] = data['lineItems'][0]['fulfillmentInfo']['storeInfo']['agentInfo']['lastName']
new_data['dayPhoneNumber'] = data['lineItems'][0]['fulfillmentInfo']['storeInfo']['agentInfo']['dayPhoneNumber']
new_data['email'] = data['lineItems'][0]['fulfillmentInfo']['storeInfo']['agentInfo']['email']
#new_data['firstName'] = data['paymentInfo']['billingAddressInfo']['firstName']
return new_data
except Exception as e:
print('Here error -> '+str(e))
return False
#def reg_result(res):
# "COP Order Response"
# lines = res.split('\n')
# for line in lines:
# pattern = "COP Order Response*"
# prog = re.compile(pattern)
# result = prog.search(line)
# if result == None:
# continue
# res1 = result.string.split('{')
# if len(res1) < 2:
# continue
# res2 = res1[1].split(',')
# if len(res2) < 2:
# continue
# res3 = res2[0].split(':')
# if len(res3) < 2:
# continue
# return res3[1]
# COP Order Response : {"id":"BBY01-200001878853"
# return False
# return val is Boolean
# True/False
# Input type: String
# for positive integer only
# alternative way: Handle exception for int(d)
def validate_sku(sku_val):
return sku_val.isnumeric()
# input val : string
# return val: Boolean
def validate_env(env_val):
env_list = [
"sftqa1" , "sftqa2" , "sftqa3" , "sftqa4"
]
if env_val in env_list:
return True
else:
return False
def validate_payment(payment_val):
env_payment = [
"AMEX","VISA"
]
if payment_val in env_payment:
return True
else:
return False
# input val : string
# return val: Boolean
def check_specialCharacter(s):
if s == "":
return False
if s.isspace():
return False
return s.isalnum()
slack_token = os.environ["SLACK_API_TOKEN"]
rtm_client = slack.RTMClient(token=slack_token)
rtm_client.start()
#database connction
mydb = mysql.connector.connect(
host="host",
user="user",
passwd="pass",
database="db"
)
mycursor = mydb.cursor()
for value in submsg:
print(value)
fulfilment=submsg[0]
sku=submsg[1]
environment=submsg[2]
payment=submsg[3]
ts = time.time()
date = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S')
orderNumber=data['id']
username=user_name
print(fulfilment)
print(sku)
print(environment)
print(payment)
print(username)
print(orderNumber)
sqlformula = "INSERT INTO orderDetails (fulfilment,sku,environment,payment,orderNumber,date,user) VALUES (%s,%s,%s,%s,%s,%s,%s)"
#order=("sth",3643387,"sftqa2","AMEX")
#mycursor.execute(sqlformula,order)
mycursor.execute(sqlformula,(fulfilment,sku,environment,payment,orderNumber,date,username))
mydb.commit()
mydb.close()
Output
1 sh /home/iptbot/iptautobot/test.sh sftqa3 spu 3646989 AMEX
2 error
3 CRITICAL:root:'user'
4 error
5 CRITICAL:root:'user' // clicking Control+C values get inserted
6 ^CWARNING:slack.rtm.client:Websocket was closed.
7 3646989
8 sftqa3
9 AMEX
10 spu
11 3646989
12 sftqa3
13 AMEX
14 a6002043
15 BBY01-200002091354
You are stuck at this point because rtm_client.start() is a synchronous call.
If you want it to be asynchronous (non-blocking) then you should run:
rtm_client.start(run_async=True)
Here it is good walk-through on how to setup async usage of the library. Also have a look at the method signature for RTMClient to get an idea of how it works.
Here's a good example detailing a lot of what you would need in your case.
Then you will hit your db execution code where you will need to have a while loop to go through the data you want to add to the DB.
I would recommend that you use a Queue for this as it is synchronised and will be easier to manage than a global list which is overwritten on every order. Preferably you could use asyncio.Queue with an example of implementation here
When an order has passed the validation steps add it to the queue. Here is some pseudo code describing the flow with a basic (not asyncio) Queue:
import queue
q = queue.Queue()
def validate_order(order):
valid_order_data = ......
q.put(valid_order_data)
while True:
valid_order = q.get() # Will wait until there is a value on the queue
mycursor.execute(sqlformula, (valid_order))
springerlink has changed its structure, and now the script doesn't work anymore. With it you should could download all chapters at once instead of all single chapters.
i installed the script and its dependencies with linux.
from here http://milianw.de/code-snippets/take-2-download-script-for-springerlinkcom-ebooks and here https://github.com/milianw/springer_download
#! /usr/bin/env python
# -*- coding: utf-8 -*-
import os
import sys
import getopt
import urllib
import re
import tempfile
import shutil
import subprocess
# Set some kind of User-Agent so we don't get blocked by SpringerLink
class SpringerURLopener(urllib.FancyURLopener):
version = "Mozilla 5.0"
def pdfcat(fileList, bookTitlePath):
if findInPath("pdftk") != False:
command = [findInPath("pdftk")]
command.extend(fileList)
command.extend(["cat", "output", bookTitlePath])
subprocess.Popen(command, shell=False).wait()
elif findInPath("stapler") != False:
command = [findInPath("stapler"), "cat"]
command.extend(fileList)
command.append(bookTitlePath)
subprocess.Popen(command, shell=False).wait()
else:
error("You have to install pdftk (http://www.accesspdf.com/pdftk/) or stapler (http://github.com/hellerbarde/stapler).")
# validate CLI arguments and start downloading
def main(argv):
if not findInPath("iconv"):
error("You have to install iconv.")
#Test if convert is installed
if os.system("convert --version > /dev/null 2>&1")!=0:
error("You have to install the packet ImageMagick in order to use convert")
try:
opts, args = getopt.getopt(argv, "hl:c:n", ["help", "link=", "content=", "no-merge"])
except getopt.GetoptError:
error("Could not parse command line arguments.")
link = ""
hash = ""
merge = True
for opt, arg in opts:
if opt in ("-h", "--help"):
usage()
sys.exit()
elif opt in ("-c", "--content"):
if link != "":
usage()
error("-c and -l arguments are mutually exclusive")
hash = arg
elif opt in ("-l", "--link"):
if hash != "":
usage()
error("-c and -l arguments are mutually exclusive")
match = re.match("(https?://)?(www\.)?springer(link)?.(com|de)/(content|.*book)/(?P<hash>[a-z0-9\-]+)/?(\?[^/]*)?$", arg)
if not match:
usage()
error("Bad link given. See example link.")
hash = match.group("hash")
elif opt in ("-n", "--no-merge"):
merge = False
if hash == "":
usage()
error("Either a link or a hash must be given.")
if merge and not findInPath("pdftk") and not findInPath("stapler"):
error("You have to install pdftk (http://www.accesspdf.com/pdftk/) or stapler (http://github.com/hellerbarde/stapler).")
baseLink = "http://www.springerlink.com/content/" + hash + "/"
link = baseLink + "contents/"
chapters = list()
loader = SpringerURLopener();
curDir = os.getcwd()
bookTitle = ""
coverLink = ""
front_matter = False
while True:
# download page source
try:
print "fetching book information...\n\t%s" % link
page = loader.open(link,"MUD=MP").read()
except IOError, e:
error("Bad link given (%s)" % e)
if re.search(r'403 Forbidden', page):
error("Could not access page: 403 Forbidden error.")
if bookTitle == "":
match = re.search(r'<h1[^<]+class="title">(.+?)(?:<br/>\s*<span class="subtitle">(.+?)</span>\s*)?</h1>', page, re.S)
if not match or match.group(1).strip() == "":
error("Could not evaluate book title - bad link %s" % link)
else:
bookTitle = match.group(1).strip()
# remove tags, e.g. <sub>
bookTitle = re.sub(r'<[^>]*?>', '', bookTitle)
# subtitle
if match and match.group(2) and match.group(2).strip() != "":
bookTitle += " - " + match.group(2).strip()
# edition
#match = re.search(r'<td class="labelName">Edition</td><td class="labelValue">([^<]+)</td>', page)
#if match:
#bookTitle += " " + match.group(1).strip()
## year
#match = re.search(r'<td class="labelName">Copyright</td><td class="labelValue">([^<]+)</td>', page)
#if match:
#bookTitle += " " + match.group(1).strip()
## publisher
#match = re.search(r'<td class="labelName">Publisher</td><td class="labelValue">([^<]+)</td>', page)
#if match:
#bookTitle += " - " + match.group(1).strip()
# coverimage
match = re.search(r'<div class="coverImage" title="Cover Image" style="background-image: url\(/content/([^/]+)/cover-medium\.gif\)">', page)
if match:
coverLink = "http://www.springerlink.com/content/" + match.group(1) + "/cover-large.gif"
bookTitlePath = curDir + "/%s.pdf" % sanitizeFilename(bookTitle)
if bookTitlePath == "":
error("could not transliterate book title %s" % bookTitle)
if os.path.isfile(bookTitlePath):
error("%s already downloaded" % bookTitlePath)
print "\nNow Trying to download book '%s'\n" % bookTitle
#error("foo")
# get chapters
for match in re.finditer('href="([^"]+\.pdf)"', page):
chapterLink = match.group(1)
if chapterLink[:7] == "http://": # skip external links
continue
if re.search(r'front-matter.pdf', chapterLink):
if front_matter:
continue
else:
front_matter = True
if re.search(r'back-matter.pdf', chapterLink) and re.search(r'<a href="([^"#]+)"[^>]*>Next</a>', page):
continue
#skip backmatter if it is in list as second chapter - will be there at the end of the book also
if re.search(r'back-matter.pdf', chapterLink):
if len(chapters)<2:
continue
chapters.append(chapterLink)
# get next page
match = re.search(r'<a href="([^"#]+)"[^>]*>Next</a>', page)
if match:
link = "http://www.springerlink.com" + match.group(1).replace("&", "&")
else:
break
if len(chapters) == 0:
error("No chapters found - bad link?")
print "found %d chapters" % len(chapters)
# setup; set tempDir as working directory
tempDir = tempfile.mkdtemp()
os.chdir(tempDir)
i = 1
fileList = list()
for chapterLink in chapters:
if chapterLink[0] == "/":
chapterLink = "http://www.springerlink.com" + chapterLink
else:
chapterLink = baseLink + chapterLink
chapterLink = re.sub("/[^/]+/\.\.", "", chapterLink)
print "downloading chapter %d/%d" % (i, len(chapters))
localFile, mimeType = geturl(chapterLink, "%d.pdf" % i)
if mimeType.gettype() != "application/pdf":
os.chdir(curDir)
shutil.rmtree(tempDir)
error("downloaded chapter %s has invalid mime type %s - are you allowed to download %s?" % (chapterLink, mimeType.gettype(), bookTitle))
fileList.append(localFile)
i += 1
if coverLink != "":
print "downloading front cover from %s" % coverLink
localFile, mimeType = geturl(coverLink, "frontcover")
if os.system("convert %s %s.pdf" % (localFile, localFile)) == 0:
fileList.insert(0, localFile + ".pdf")
if merge:
print "merging chapters"
if len(fileList) == 1:
shutil.move(fileList[0], bookTitlePath)
else:
pdfcat(fileList, bookTitlePath)
# cleanup
os.chdir(curDir)
shutil.rmtree(tempDir)
print "book %s was successfully downloaded, it was saved to %s" % (bookTitle, bookTitlePath)
log("downloaded %s chapters (%.2fMiB) of %s\n" % (len(chapters), os.path.getsize(bookTitlePath)/2.0**20, bookTitle))
else: #HL: if merge=False
print "book %s was successfully downloaded, unmerged chapters can be found in %s" % (bookTitle, tempDir)
log("downloaded %s chapters of %s\n" % (len(chapters), bookTitle))
sys.exit()
# give a usage message
def usage():
print """Usage:
%s [OPTIONS]
Options:
-h, --help Display this usage message
-l LINK, --link=LINK defines the link of the book you intend to download
-c ISBN, --content=ISBN builds the link from a given ISBN (see below)
-n, --no-merge Only download the chapters but don't merge them into a single PDF.
You have to set exactly one of these options.
LINK:
The link to your the detail page of the ebook of your choice on SpringerLink.
It lists book metadata and has a possibly paginated list of the chapters of the book.
It has the form:
http://www.springerlink.com/content/ISBN/STUFF
Where: ISBN is a string consisting of lower-case, latin chars and numbers.
It alone identifies the book you intent do download.
STUFF is optional and looks like #section=... or similar. It will be stripped.
""" % os.path.basename(sys.argv[0])
# raise an error and quit
def error(msg=""):
if msg != "":
log("ERR: " + msg + "\n")
print "\nERROR: %s\n" % msg
sys.exit(2)
return None
# log to file
def log(msg=""):
logFile = open('springer_download.log', 'a')
logFile.write(msg)
logFile.close()
# based on http://stackoverflow.com/questions/377017/test-if-executable-exists-in-python
def findInPath(prog):
for path in os.environ["PATH"].split(os.pathsep):
exe_file = os.path.join(path, prog)
if os.path.exists(exe_file) and os.access(exe_file, os.X_OK):
return exe_file
return False
# based on http://mail.python.org/pipermail/python-list/2005-April/319818.html
def _reporthook(numblocks, blocksize, filesize, url=None):
#XXX Should handle possible filesize=-1.
try:
percent = min((numblocks*blocksize*100)/filesize, 100)
except:
percent = 100
if numblocks != 0:
sys.stdout.write("\b"*70)
sys.stdout.write("%-66s%3d%%" % (url, percent))
def geturl(url, dst):
downloader = SpringerURLopener()
if sys.stdout.isatty():
response = downloader.retrieve(url, dst,
lambda nb, bs, fs, url=url: _reporthook(nb,bs,fs,url), "MUD=MP")
sys.stdout.write("\n")
else:
response = downloader.retrieve(url, dst, None, "MUD=MP")
return response
def sanitizeFilename(filename):
p1 = subprocess.Popen(["echo", filename], stdout=subprocess.PIPE)
p2 = subprocess.Popen(["iconv", "-f", "UTF-8", "-t" ,"ASCII//TRANSLIT"], stdin=p1.stdout, stdout=subprocess.PIPE)
return re.sub("\s+", "_", p2.communicate()[0].strip().replace("/", "-"))
# start program
if __name__ == "__main__":
main(sys.argv[1:])
# kate: indent-width 4; replace-tabs on;
excpected: it should downlaod the book
actual results: with command ./springer_download.py -c "978-3-662-54804-2" i get ERROR: Could not evaluate book title - bad link http://www.springerlink.com/content/978-3-662-54804-2/contents/
the test
python2 ./springer_download.py -c "978-3-662-54804-2"
does not work either
in the code above the error is in the context
match = re.search(r'<h2 class="MPReader_Profiles_SpringerLink_Content_PrimitiveHeadingControlName">([^<]+)</h2>', page)
if not match or match.group(1).strip() == "":
error("Could not evaluate book title - bad link?")
else:
bookTitle = match.group(1).strip()
print "\nThe book you are trying to download is called '%s'\n" % bookTitle
i would also be happy with alternatives like browser addons or the like. Using the example https://link.springer.com/book/10.1007/978-3-662-54805-9#toc
I tried to run a simple crawler in Python:
import sys
import csv
import socket
import sqlite3
import logging
from optparse import OptionParser
from urlparse import urlparse
#pip install requests
import requests
#################################################################
# FUNCTION process_row_to_db.
# handle one row and push to the DB
#
#################################################################
def process_row_to_db(conn, data_row, comment, hostname):
insert_stmt = "INSERT OR IGNORE INTO adstxt (SITE_DOMAIN, EXCHANGE_DOMAIN, SELLER_ACCOUNT_ID, ACCOUNT_TYPE, TAG_ID, ENTRY_COMMENT) VALUES (?, ?, ?, ?, ?, ? );"
exchange_host = ''
seller_account_id = ''
account_type = ''
tag_id = ''
if len(data_row) >= 3:
exchange_host = data_row[0].lower()
seller_account_id = data_row[1].lower()
account_type = data_row[2].lower()
if len(data_row) == 4:
tag_id = data_row[3].lower()
#data validation heurstics
data_valid = 1;
# Minimum length of a domain name is 1 character, not including extensions.
# Domain Name Rules - Nic AG
# www.nic.ag/rules.htm
if(len(hostname) < 3):
data_valid = 0
if(len(exchange_host) < 3):
data_valid = 0
# could be single digit integers
if(len(seller_account_id) < 1):
data_valid = 0
## ads.txt supports 'DIRECT' and 'RESELLER'
if(len(account_type) < 6):
data_valid = 0
if(data_valid > 0):
logging.debug( "%s | %s | %s | %s | %s | %s" % (hostname, exchange_host, seller_account_id, account_type, tag_id, comment))
# Insert a row of data using bind variables (protect against sql injection)
c = conn.cursor()
c.execute(insert_stmt, (hostname, exchange_host, seller_account_id, account_type, tag_id, comment))
# Save (commit) the changes
conn.commit()
return 1
return 0
# end process_row_to_db #####
#################################################################
# FUNCTION crawl_to_db.
# crawl the URLs, parse the data, validate and dump to a DB
#
#################################################################
def crawl_to_db(conn, crawl_url_queue):
rowcnt = 0
myheaders = {
'User-Agent': 'AdsTxtCrawler/1.0; +https://github.com/InteractiveAdvertisingBureau/adstxtcrawler',
'Accept': 'text/plain',
}
for aurl in crawl_url_queue:
ahost = crawl_url_queue[aurl]
logging.info(" Crawling %s : %s " % (aurl, ahost))
r = requests.get(aurl, headers=myheaders)
logging.info(" %d" % r.status_code)
if(r.status_code == 200):
logging.debug("-------------")
logging.debug(r.request.headers)
logging.debug("-------------")
logging.debug("%s" % r.text)
logging.debug("-------------")
tmpfile = 'tmpads.txt'
with open(tmpfile, 'wb') as tmp_csv_file:
tmp_csv_file.write(r.text)
tmp_csv_file.close()
with open(tmpfile, 'rb') as tmp_csv_file:
#read the line, split on first comment and keep what is to the left (if any found)
line_reader = csv.reader(tmp_csv_file, delimiter='#', quotechar='|')
comment = ''
for line in line_reader:
logging.debug("DATA: %s" % line)
try:
data_line = line[0]
except:
data_line = "";
#determine delimiter, conservative = do it per row
if data_line.find(",") != -1:
data_delimiter = ','
elif data_line.find("\t") != -1:
data_delimiter = '\t'
else:
data_delimiter = ' '
data_reader = csv.reader([data_line], delimiter=',', quotechar='|')
for row in data_reader:
if len(row) > 0 and row[0].startswith( '#' ):
continue
if (len(line) > 1) and (len(line[1]) > 0):
comment = line[1]
rowcnt = rowcnt + process_row_to_db(conn, row, comment, ahost)
return rowcnt
# end crawl_to_db #####
#################################################################
# FUNCTION load_url_queue
# Load the target set of URLs and reduce to an ads.txt domains queue
#
#################################################################
def load_url_queue(csvfilename, url_queue):
cnt = 0
with open(csvfilename, 'rb') as csvfile:
targets_reader = csv.reader(csvfile, delimiter=',', quotechar='|')
for row in targets_reader:
if len(row) < 1 or row[0].startswith( '#' ):
continue
for item in row:
host = "localhost"
if "http:" in item or "https:" in item :
logging.info( "URL: %s" % item)
parsed_uri = urlparse(row[0])
host = parsed_uri.netloc
else:
host = item
logging.info( "HOST: %s" % item)
skip = 0
try:
#print "Checking DNS: %s" % host
ip = socket.gethostbyname(host)
if "127.0.0" in ip:
skip = 0 #swap to 1 to skip localhost testing
elif "0.0.0.0" in ip:
skip = 1
else:
logging.info(" Validated Host IP: %s" % ip)
except:
skip = 1
if(skip < 1):
ads_txt_url = 'http://{thehost}/ads.txt'.format(thehost=host)
logging.info(" pushing %s" % ads_txt_url)
url_queue[ads_txt_url] = host
cnt = cnt + 1
return cnt
# end load_url_queue #####
#### MAIN ####
arg_parser = OptionParser()
arg_parser.add_option("-t", "--targets", dest="target_filename",
help="list of domains to crawl ads.txt from", metavar="FILE")
arg_parser.add_option("-d", "--database", dest="target_database",
help="Database to dump crawled data into", metavar="FILE")
arg_parser.add_option("-v", "--verbose", dest="verbose", action='count',
help="Increase verbosity (specify multiple times for more)")
(options, args) = arg_parser.parse_args()
if len(sys.argv)==1:
arg_parser.print_help()
exit(1)
log_level = logging.WARNING # default
if options.verbose == 1:
log_level = logging.INFO
elif options.verbose >= 2:
log_level = logging.DEBUG
logging.basicConfig(filename='adstxt_crawler.log',level=log_level,format='%(asctime)s %(filename)s:%(lineno)d:%(levelname)s %(message)s')
crawl_url_queue = {}
conn = None
cnt_urls = 0
cnt_records = 0
cnt_urls = load_url_queue(options.target_filename, crawl_url_queue)
if (cnt_urls > 0) and options.target_database and (len(options.target_database) > 1):
conn = sqlite3.connect(options.target_database)
with conn:
cnt_records = crawl_to_db(conn, crawl_url_queue)
if(cnt_records > 0):
conn.commit()
#conn.close()
print "Wrote %d records from %d URLs to %s" % (cnt_records, cnt_urls, options.target_database)
logging.warning("Wrote %d records from %d URLs to %s" % (cnt_records, cnt_urls, options.target_database))
logging.warning("Finished.")
I'm using Python 2.7.9.
I tried to install sqlite with this command:
python -m pip install sqlite
I got back this:
Downloading/unpacking sqlite3 Could not find any downloads that
satisfy the requirement sqlite3 Cleaning up... No distributions at all
found for sqlite3 Storing debug log for failure in ...\pip.log
First step would be this command:
$sqlite3 adstxt.db < adstxt_crawler.sql
I got these:
"'sqlite3' is not recognized as an internal or external command, operable program or batch file."
I know it's very basic, but I haven't found any relevant help, if you could help me, I really apprecitiate it.
Thanks.
Adam
The first error:
'sqlite3' is not recognized as an internal or external command, operable program or batch file.
Is because you try to run sqlite command line tool, which is not installed on your system. Python 3 includes sqlite but does not provide the standalone command sqlite3
The second error is a syntax error. In Python 3, print is a standard function, so must be used with parenthesis
print('hello world')
You probably tried to run python 2 code with Python 3 interpreter
I am using pynetdicom script to fetch the data from the dcm4chee.
to run the script I need to pass the arguments from the command line.
But I already have that value in some variable or in other object and I need to use from there but I am not getting how can I pass that value to the parser or Is it possible to do without parsing.
Please help me to know how can I pass the value using some variables instead of passing through the command line.
Script :
#!/usr/bin/python
"""
For help on usage,
python qrscu.py -h
"""
import argparse
from netdicom.applicationentity import AE
from netdicom.SOPclass import *
from dicom.dataset import Dataset, FileDataset
from dicom.UID import ExplicitVRLittleEndian, ImplicitVRLittleEndian, ExplicitVRBigEndian
import netdicom
import tempfile
# parse commandline
parser = argparse.ArgumentParser(description='storage SCU example')
print "parser", parser
parser.add_argument('remotehost')
parser.add_argument('remoteport', type=int)
parser.add_argument('searchstring')
parser.add_argument('-p', help='local server port', type=int, default=9999)
parser.add_argument('-aet', help='calling AE title', default='PYNETDICOM')
parser.add_argument('-aec', help='called AE title', default='REMOTESCU')
parser.add_argument('-implicit', action='store_true', help='negociate implicit transfer syntax only', default=False)
parser.add_argument('-explicit', action='store_true', help='negociate explicit transfer syntax only', default=False)
args = parser.parse_args()
print "args :::: ", type(args), args
if args.implicit:
ts = [ImplicitVRLittleEndian]
elif args.explicit:
ts = [ExplicitVRLittleEndian]
else:
ts = [
ExplicitVRLittleEndian,
ImplicitVRLittleEndian,
ExplicitVRBigEndian
]
# call back
def OnAssociateResponse(association):
print "Association response received"
def OnAssociateRequest(association):
print "Association resquested"
return True
def OnReceiveStore(SOPClass, DS):
print "Received C-STORE", DS.PatientName
try:
# do something with dataset. For instance, store it.
file_meta = Dataset()
file_meta.MediaStorageSOPClassUID = '1.2.840.10008.5.1.4.1.1.2'
file_meta.MediaStorageSOPInstanceUID = "1.2.3" # !! Need valid UID here
file_meta.ImplementationClassUID = "1.2.3.4" # !!! Need valid UIDs here
filename = '%s/%s.dcm' % (tempfile.gettempdir(), DS.SOPInstanceUID)
ds = FileDataset(filename, {}, file_meta=file_meta, preamble="\0" * 128)
ds.update(DS)
ds.save_as(filename)
print "File %s written" % filename
except:
pass
# must return appropriate status
return SOPClass.Success
# create application entity with Find and Move SOP classes as SCU and
# Storage SOP class as SCP
MyAE = AE(args.aet, args.p, [PatientRootFindSOPClass,
PatientRootMoveSOPClass,
VerificationSOPClass], [StorageSOPClass], ts)
MyAE.OnAssociateResponse = OnAssociateResponse
MyAE.OnAssociateRequest = OnAssociateRequest
MyAE.OnReceiveStore = OnReceiveStore
MyAE.start()
# remote application entity
RemoteAE = dict(Address=args.remotehost, Port=args.remoteport, AET=args.aec)
# create association with remote AE
print "Request association"
assoc = MyAE.RequestAssociation(RemoteAE)
# perform a DICOM ECHO
print "DICOM Echo ... ",
if assoc:
st = assoc.VerificationSOPClass.SCU(1)
print 'done with status "%s"' % st
print "DICOM FindSCU ... ",
print "\n\n----------------------------------------------------------------------\n\n"
d = Dataset()
d.StudyDate = args.searchstring
d.QueryRetrieveLevel = "STUDY"
d.PatientID = "*"
study = [x[1] for x in assoc.PatientRootFindSOPClass.SCU(d, 1)][:-1]
print 'done with status "%s"' % st
print "\n\n\n Cont...", study
print "\n\n----------------------------------------------------------------------\n\n"
# loop on patients
for pp in study:
print "\n\n----------------------Pateint Detals------------------------------------------------\n\n"
print "%s - %s" % (pp.StudyDate, pp.PatientID)
# find studies
d = Dataset()
d.PatientID = pp.PatientID
d.QueryRetrieveLevel = "STUDY"
d.PatientName = ""
d.StudyInstanceUID = ""
d.StudyDate = ""
d.StudyTime = ""
d.StudyID = ""
d.ModalitiesInStudy = ""
d.StudyDescription = ""
studies = [x[1] for x in assoc.PatientRootFindSOPClass.SCU(d, 1)][:-1]
# loop on studies
for st in studies:
print "\n study :: ", studies
print "\n\n---------------------------Study---------------------------\n\n"
print " %s - %s %s" % (st.StudyDescription, st.StudyDate, st.StudyTime)
d = Dataset()
d.QueryRetrieveLevel = "SERIES"
d.StudyInstanceUID = st.StudyInstanceUID
d.SeriesInstanceUID = ""
d.InstanceNumber = ""
d.Modality = ""
d.SeriesNumber = ""
d.SeriesDescription = ""
d.AccessionNumber = ""
d.SeriesDate = ""
d.SeriesTime = ""
d.SeriesID = ""
d.NumberOfSeriesRelatedInstances = ""
series = [x[1] for x in assoc.PatientRootFindSOPClass.SCU(d, 1)][:-1]
# print series uid and number of instances
if series:
for se in series:
print "\n\n---------------------------Series---------------------------\n\n"
print "\n\n\n series", se
print " %15s - %10s - %35s - %5s" % (se.SeriesNumber, se.Modality, se.SeriesDescription, se.NumberOfSeriesRelatedInstances)
print "Release association"
assoc.Release(0)
# done
MyAE.Quit()
else:
print "Failed to create Association."
# vim:expandtab:smartindent:tabstop=4:softtabstop=4:shiftwidth=4:
Put everything inside functions and call them wherever you like:
#!/usr/bin/python
"""
For help on usage,
python qrscu.py -h
"""
import os
import argparse
from netdicom.applicationentity import AE
from netdicom.SOPclass import *
from dicom.dataset import Dataset, FileDataset
from dicom.UID import ExplicitVRLittleEndian, ImplicitVRLittleEndian, ExplicitVRBigEndian
import netdicom
import tempfile
# call back
def OnAssociateResponse(association):
print "Association response received"
def OnAssociateRequest(association):
print "Association resquested"
return True
def OnReceiveStore(SOPClass, DS):
print "Received C-STORE", DS.PatientName
try:
# do something with dataset. For instance, store it.
file_meta = Dataset()
file_meta.MediaStorageSOPClassUID = '1.2.840.10008.5.1.4.1.1.2'
file_meta.MediaStorageSOPInstanceUID = "1.2.3" # !! Need valid UID here
file_meta.ImplementationClassUID = "1.2.3.4" # !!! Need valid UIDs here
filename = os.path.join(tempfile.gettempdir(), DS.SOPInstanceUID + '%s.dcm')
ds = FileDataset(filename, {}, file_meta=file_meta, preamble="\0" * 128)
ds.update(DS)
ds.save_as(filename)
print "File %s written" % filename
except Exception as e:
print "Some exception occured", e
# must return appropriate status
return SOPClass.Success
def print_data(remotehost, remoteport, searchstring, local_port=9999,
calling_title='PYNETDICOM', called_title='REMOTESCU',
ts=(ExplicitVRLittleEndian, ImplicitVRLittleEndian, ExplicitVRBigEndian)):
# create application entity with Find and Move SOP classes as SCU and
# Storage SOP class as SCP
MyAE = AE(calling_title, local_port, [PatientRootFindSOPClass,
PatientRootMoveSOPClass,
VerificationSOPClass], [StorageSOPClass], ts)
MyAE.OnAssociateResponse = OnAssociateResponse
MyAE.OnAssociateRequest = OnAssociateRequest
MyAE.OnReceiveStore = OnReceiveStore
MyAE.start()
# remote application entity
RemoteAE = dict(Address=remotehost, Port=remoteport, AET=called_title)
# create association with remote AE
print "Request association"
assoc = MyAE.RequestAssociation(RemoteAE)
# perform a DICOM ECHO
print "DICOM Echo ... ",
if not assoc:
print "Failed to create Association."
return
st = assoc.VerificationSOPClass.SCU(1)
print 'done with status "%s"' % st
print "DICOM FindSCU ... ",
print "\n\n----------------------------------------------------------------------\n\n"
d = Dataset()
d.StudyDate = searchstring
d.QueryRetrieveLevel = "STUDY"
d.PatientID = "*"
study = [x[1] for x in assoc.PatientRootFindSOPClass.SCU(d, 1)][:-1]
print 'done with status "%s"' % st
print "\n\n\n Cont...", study
print "\n\n----------------------------------------------------------------------\n\n"
# loop on patients
for pp in study:
print "\n\n----------------------Pateint Detals------------------------------------------------\n\n"
print "%s - %s" % (pp.StudyDate, pp.PatientID)
# find studies
d = Dataset()
d.PatientID = pp.PatientID
d.QueryRetrieveLevel = "STUDY"
d.PatientName = ""
d.StudyInstanceUID = ""
d.StudyDate = ""
d.StudyTime = ""
d.StudyID = ""
d.ModalitiesInStudy = ""
d.StudyDescription = ""
studies = [x[1] for x in assoc.PatientRootFindSOPClass.SCU(d, 1)][:-1]
# loop on studies
for st in studies:
print "\n study :: ", studies
print "\n\n---------------------------Study---------------------------\n\n"
print " %s - %s %s" % (st.StudyDescription, st.StudyDate, st.StudyTime)
d = Dataset()
d.QueryRetrieveLevel = "SERIES"
d.StudyInstanceUID = st.StudyInstanceUID
d.SeriesInstanceUID = ""
d.InstanceNumber = ""
d.Modality = ""
d.SeriesNumber = ""
d.SeriesDescription = ""
d.AccessionNumber = ""
d.SeriesDate = ""
d.SeriesTime = ""
d.SeriesID = ""
d.NumberOfSeriesRelatedInstances = ""
series = [x[1] for x in assoc.PatientRootFindSOPClass.SCU(d, 1)][:-1]
# print series uid and number of instances
for se in series:
print "\n\n---------------------------Series---------------------------\n\n"
print "\n\n\n series", se
print " %15s - %10s - %35s - %5s" % (se.SeriesNumber, se.Modality, se.SeriesDescription, se.NumberOfSeriesRelatedInstances)
print "Release association"
assoc.Release(0)
# done
MyAE.Quit()
def parse_commandline():
# parse commandline
parser = argparse.ArgumentParser(description='storage SCU example')
print "parser", parser
parser.add_argument('remotehost')
parser.add_argument('remoteport', type=int)
parser.add_argument('searchstring')
parser.add_argument('-p', help='local server port', type=int, default=9999)
parser.add_argument('-aet', help='calling AE title', default='PYNETDICOM')
parser.add_argument('-aec', help='called AE title', default='REMOTESCU')
parser.add_argument('-implicit', action='store_true', help='negociate implicit transfer syntax only', default=False)
parser.add_argument('-explicit', action='store_true', help='negociate explicit transfer syntax only', default=False)
args = parser.parse_args()
print "args :::: ", type(args), args
if args.implicit:
ts = [ImplicitVRLittleEndian]
elif args.explicit:
ts = [ExplicitVRLittleEndian]
else:
ts = [
ExplicitVRLittleEndian,
ImplicitVRLittleEndian,
ExplicitVRBigEndian
]
return args, ts
if __name__ == '__main__':
args, ts = parse_commandline()
print_data(args.remotehost, args.remoteport, args.searchstring, args.p, args.aet, args.aec, ts)
and use it like:
import your_module
your_module.print_data(remotehost, remoteport, searchstring)
You can incorporate and use the script above in your own code by using the pyhton module subprocess. It will let you run the script with arguments that you define depending on your variables or objects.
Example: Let say that you have some variables your_arg_1...your_arg_n that can be consumed by grscu.py. Then you can pass these variables to the script with
import subprocess
r = subprocess.check_call(['grscu.py', your_arg_1, your_arg_2, ..., your_arg_n])
The "args = parser.parse_args()" in the script will grab the variables and pass them to the MyAE object. For more information about argparse, see link.
I'm new to Ubuntu (and the Python scripts that go with it) and I've been hitting this error with the iTunesToRhythm script.
**Traceback (most recent call last):
File "/home/amylee/iTunesToRhythm.py", line 220, in <module>
main(sys.argv)
File "/home/amylee/iTunesToRhythm.py", line 48, in main
match = correlator.correlateSong( song, options.confirm, options.fastAndLoose, options.promptForDisambiguate )
File "/home/amylee/iTunesToRhythm.py", line 133, in correlateSong
matches = self.parser.findSongBySize( song.size );
AttributeError: 'NoneType' object has no attribute 'findSongBySize'**
I understand the concept behind fixing the issue but have no idea how to go about it. I've looked at answers to similar problems but none really help me, especially since I have no clue as to what I am doing. I've included the full script below. Thanks in advance, dudes who know way more about this stuff than I do.
----iTunesToRhythm.py----
import sys
import platform
if platform.system() == "Darwin":
sys.path.append('/sw/lib/python2.5/site-packages/')
from dumpitunesmac import iTunesMacParser, iTunesMacSong
import libxml2
import linecache
from optparse import OptionParser, OptionGroup
from dumprhythm import RhythmLibraryParser, RhythmSong
from dumpitunes import iTunesLibraryParser, iTunesSong
def main(argv):
# process command line
options, args = processCommandLine(argv)
print "Reading input from " + args[0]
inputParser = getParser(args[0], options )
print "Writing to output " + args[1]
destinationParser = getParser(args[1], options )
#retrieve destination songs
allDestinationSongs = destinationParser.getSongs()
# go through each song in destination library
correlator = SongCorrelator(inputParser)
for song in allDestinationSongs:
print song.artist + " - " + song.album + " - " + song.title + " - " + str(song.size)
if song.size != None and song.size != "Unknown":
# find equivalent itunes song
match = correlator.correlateSong( song, options.confirm, options.fastAndLoose, options.promptForDisambiguate )
# update database, if match
if match != None and options.writeChanges == True:
if options.noratings == False:
song.setRating( match.rating )
print "\t\t\tRating changed to " + str( match.rating )
if options.noplaycounts == False:
song.setPlaycount( match.playcount )
print "\t\t\tPlay count changed to " + str( match.playcount )
# dump summary results
print "\nSummary\n------------------------------------"
print "manually resolved matches = " + str( correlator.manuallyResolvedMatches)
print "full matches = " + str( correlator.fullMatches )
print "partial matches = " + str( correlator.partialMatches)
print "no matches = " + str( correlator.zeroMatches )
print "unresolved ambiguous matches = " + str( correlator.ambiguousMatches )
# save
if options.writeChanges == True:
destinationParser.save()
print "Changes were written to destination"
else:
print "Changes were not written to destination \n\tuse -w to actually write changes to disk"
def getParser( file, options ):
if file == "mysql":
print "\tassuming amarok database"
return AmarokLibraryParser(options.servername, options.database, options.username, options.password )
if file == "itunes":
print "\tassuming itunes on the mac"
return iTunesMacParser()
desc = linecache.getline( file, 2)
if desc.find("Apple Computer") != -1:
#open itunes linbrary
print "\tdetected Itunes library"
return iTunesLibraryParser(file);
if desc.find("rhythmdb") != -1:
print "\tdetected Rhythm box library"
return RhythmLibraryParser(file)
def processCommandLine( argv ):
parser = OptionParser("iTunesToRhythm [options] <inputfile>|itunes|mysql <outputfile>|mysql|itunes")
parser.add_option("-c", "--confirm", action="store_true", dest="confirm", default = False, help="confirm every match" )
parser.add_option("-w", "--writechanges", action="store_true", dest="writeChanges", default = False, help="write changes to destination file" )
parser.add_option("-a", "--disambiguate", action="store_true", dest="promptForDisambiguate", default = False, help="prompt user to resolve ambiguities" )
parser.add_option("-l", "--fastandloose", action="store_true", dest= "fastAndLoose", default = False, help = "ignore differences in files name when a file size match is made against a single song. Will not resolve multiple matches" )
parser.add_option("--noplaycounts", action="store_true", dest= "noplaycounts", default = False, help = "do not update play counts" )
parser.add_option("--noratings", action="store_true", dest= "noratings", default = False, help = "do not update ratings" )
amarokGroup = OptionGroup(parser, "Amarok options", "Options for connecting to an Amarok MySQL remote database")
amarokGroup.add_option("-s", "--server", dest="servername", help = "host name of the MySQL database server")
amarokGroup.add_option("-d", "--database", dest="database", help = "database name of the amarok database")
amarokGroup.add_option("-u", "--username", dest="username", help = "login name of the amarok database")
amarokGroup.add_option("-p", "--password", dest="password", help = "password of the user")
parser.add_option_group(amarokGroup)
# parse options
options, args = parser.parse_args()
# check that files are specified
if len(args) != 2:
parser.print_help()
parser.error( "you must supply 2 file names or 1 file name and the word mysql followed by database information. Specyfing itunes will use a running instance of iTunes on the Mac" )
# make surce source & destination are not the same
if args[0] == args[1]:
parser.error("source and destination cannot be the same")
# we're ok
return options, args
class SongCorrelator:
def __init__(self, parser ):
self.parser = parser
self.zeroMatches = 0
self.fullMatches = 0
self.ambiguousMatches = 0;
self.partialMatches = 0;
self.manuallyResolvedMatches = 0;
# attempt to find matching song in database
def correlateSong( self, song, confirm, fastAndLoose, promptForDisambiguate ):
match = None
matches = self.parser.findSongBySize( song.size );
matchcount = len(matches)
# no results
if matchcount == 0:
print "\t no matches found"
self.zeroMatches = self.zeroMatches + 1
# full match
elif matchcount == 1:
match = matches[0]
if match.title == song.title:
print "\t 100% match on " + self.dumpMatch( match )
self.fullMatches = self.fullMatches + 1
else:
if fastAndLoose == False:
match = self.disambiguate( song, matches, promptForDisambiguate )
else:
print "\t 50% match on " + self.dumpMatch( match )
self.partialMatches = self.partialMatches + 1
# multiple matches
else:
print "\t multiple matches"
for match in matches:
print "\t\t " + self.dumpMatch( match )
# attempt a resolution
match = self.disambiguate( song, matches, promptForDisambiguate )
#review
if confirm == True:
foo = raw_input( 'press <enter> to continue, Ctrl-C to cancel')
#done
return match
def dumpMatch( self, match ):
return match.title + ", playcount = " + str(match.playcount) + ", rating = " + str(match.rating)
def disambiguate(self,song,matches,prompt):
# attempt to disambiguate by title
print "\t looking for secondary match on title"
titlematchcount = 0
for match in matches:
if match.title == song.title:
titlematchcount = titlematchcount + 1
latstitlematch = match
if titlematchcount == 1:
# we successfully disambiguated using the title
print "\t\t disambiguated using title"
self.fullMatches = self.fullMatches + 1
return latstitlematch
if prompt == True:
print "\t\t cannot disambiguate. Trying to match " + song.filePath
print "Please select file or press <Enter> for no match:"
numMatch = 0
for match in matches:
numMatch = numMatch + 1
print "\t\t\t\t[" + str(numMatch) + "] " + self.dumpMatch(match) + ", " + match.filePath
selection = self.inputNumber("\t\t\t\t? ", 1, len(matches) )
if selection > 0:
self.manuallyResolvedMatches = self.manuallyResolvedMatches + 1
return matches[selection - 1]
# user did not select, record ambiguity
self.ambiguousMatches = self.ambiguousMatches + 1
return None
def inputNumber(self, msg, min, max):
result = raw_input(msg)
if len(result) == 0:
return 0
try:
resultNum = int(result)
if resultNum < min or resultNum > max:
print "out of range"
return self.inputNumber( msg, min, max )
return resultNum
except:
print "invalid input"
return self.inputNumber(msg, min, max)
if __name__ == "__main__":
main(sys.argv)
I'm the original developer. I updated the script to throw an exception if the file format is not recognized (I think this is what you are running into). I also incorporated some useful patches from another user.
Please download the files again and e-mail me if you still have trouble.
Your problem appears to be that getParser is returning None, presumably because all the if conditions have failed.
Check that args[0] and options are the values that you expect them to be.
I'd suggest raising an exception at the end of the getParser method if the arguments are not valid so that the error is raised close to the cause of the problem rather in some unrelated code much later.