Python loop overwrites previous text written to json file [duplicate] - python

This question already has answers here:
Difference between modes a, a+, w, w+, and r+ in built-in open function?
(9 answers)
Closed last month.
I have a Python script that performs an SQL query and writes the output of the query to a .json file. However, every time it writes to the .json file it overwrites the previously written text. I want the result of each SQL query to be written to a new, separate .json file. Below is my code, which is not working. Any help would be greatly appreciated!
from __future__ import print_function
try:
import psycopg2
except ImportError:
raise ImportError('\n\033[33mpsycopg2 library missing. pip install psycopg2\033[1;m\n')
sys.exit(1)
import re
import sys
import json
DB_HOST = 'crt.sh'
DB_NAME = 'certwatch'
DB_USER = 'guest'
OUTPUT_DIR="output/"
def connect_to_db(domain_name):
    """Run the dNSName suffix query for *domain_name* and return the cursor.

    Connects with the module-level DB_NAME, DB_USER and DB_HOST settings and
    selects every certificate identity whose name ends with *domain_name*.

    Args:
        domain_name: Domain suffix to look up, e.g. ``"example.com"``.

    Returns:
        The psycopg2 cursor on which the query was executed.

    Raises:
        psycopg2.Error: If connecting or executing the query fails (the
            failure message is printed before the error is re-raised).
    """
    query = (
        "SELECT ci.NAME_VALUE NAME_VALUE FROM certificate_identity ci "
        "WHERE ci.NAME_TYPE = 'dNSName' "
        "AND reverse(lower(ci.NAME_VALUE)) LIKE reverse(lower(%s));"
    )
    try:
        conn = psycopg2.connect(
            "dbname={0} user={1} host={2}".format(DB_NAME, DB_USER, DB_HOST))
        cursor = conn.cursor()
        # Bind the value instead of formatting it into the SQL text, so a
        # crafted line in the input file cannot alter the statement. The
        # leading LIKE wildcard travels inside the parameter.
        cursor.execute(query, ("%" + domain_name,))
    except psycopg2.Error:
        print("\n\033[1;31m[!] Unable to connect to the database\n\033[1;m")
        # Without a usable cursor there is nothing to return; surface the
        # real error rather than failing later on an unbound name.
        raise
    return cursor
def get_unique_emails(cursor, domain_name):
    """Collect the distinct values from *cursor* that contain *domain_name*.

    Every fetched row is rendered with ``str()`` and each single-quoted
    substring of that text is treated as a candidate value.

    Args:
        cursor: Object whose ``fetchall()`` returns the result rows.
        domain_name: Substring a value must contain to be kept.

    Returns:
        A list of the matching values, de-duplicated, in first-seen order.
    """
    seen = set()  # O(1) duplicate check instead of rescanning the list
    unique_emails = []
    for result in cursor.fetchall():
        for email in re.findall(r"\'(.+?)\'", str(result)):
            if email not in seen and domain_name in email:
                seen.add(email)
                unique_emails.append(email)
    return unique_emails
def print_unique_emails(unique_emails):
    """Print a summary line with the number of values, then each value sorted."""
    total = len(unique_emails)
    print("\033[1;32m[+] Total unique emails found: {}\033[1;m".format(total))
    ordered = sorted(unique_emails)
    for entry in ordered:
        print(entry)
if __name__ == '__main__':
filepath = 'test.txt'
with open(filepath) as fp:
for cnt, domain_name in enumerate(fp):
print("Line {}: {}".format(cnt, domain_name))
print(domain_name)
domain_name = domain_name.rstrip()
cursor = connect_to_db(domain_name)
unique_emails = get_unique_emails(cursor, domain_name)
print_unique_emails(unique_emails)
outfilepath = OUTPUT_DIR + unique_emails + ".json"
with open(outfilepath, 'w') as outfile:
outfile.write(json.dumps(unique_emails, sort_keys=True, indent=4))

with open(outfilepath, 'w') as outfile:
outfile.write(json.dumps(unique_emails, sort_keys=True, indent=4))
You are currently opening the file in write mode ('w'), which truncates it every time it is opened. If you want to append to the file instead, change 'w' to 'a':
with open(outfilepath, 'a') as outfile:
outfile.write(json.dumps(unique_emails, sort_keys=True, indent=4))
You can read the documentation on open() here.

I think it's because you're not looping when you're writing the JSON file: you have a single write, so it just writes to the one file. You need to do something like what you did with ... enumerate(fp):. Make another for loop, looping over each domain, and change your OUTPUT_DIR + unique_emails + ".json" to be OUTPUT_DIR + domain_name + ".json".

Related

Exporting results to CSV file

I'm pretty new to coding and I'm stuck on this problem. Written in python.
import logging
import os
import sys
import json
import pymysql
import requests
import csv
## set up logger to pass information to Cloudwatch ##
#logger = logging.getLogger()
#logger.setLevel(logging.INFO)
## define RDS variables ##
rds_host = 'host'
db_username = 'username'
db_password = 'password'
db_name = 'name'
## connect to rds database ##
try:
conn = pymysql.connect(host=rds_host, user=db_username, password=db_password, db=db_name, port=1234,
connect_timeout=10)
except Exception as e:
print("ERROR: Could not connect to MySql instance.")
print(e)
sys.exit()
print("SUCCESS: Connection to RDS mysql instance succeeded")
def main():
with conn.cursor() as cur:
cur.execute("SELECT Domain FROM domain_reg")
domains = cur.fetchall()
# logger.info(domains)
conn.close()
new_domains = []
for x in domains:
a = "http://" + x[0] + ("/orange/health")
new_domains.append(a)
print(new_domains)
for y in new_domains:
try:
response = requests.get(y)
if response.status_code == 200:
print("Domain " + y + " exists")
else:
print("Domain " + y + " does not exist; Status code = " + str(response.status_code))
except Exception as e:
print("Exception: With domain " + y)
with open("new_orangeZ.csv", "w", newline='') as csv_file:
writer = csv.writer(csv_file, delimiter=',')
for line in new_domains:
writer.writerow([new_domains])
if __name__ == "__main__":
main()
This code does create a CSV file, but it's not exporting exactly what I want it to export. It creates a CSV file listing only the "y" values, and I understand that is because I'm calling "new_domains" in writer.writerow. I'm trying to figure out how to also export the output of the print calls in the if/else statement into the CSV, if that makes sense. Sorry if this sounds like gibberish; like I said, I'm super new to coding. I was hoping to post a picture of what I get in the CSV file versus what I wanted, but I'm new to Stack Overflow too, so it doesn't allow me to post pictures, haha.
Thank you!!!
print() only displays the strings on the screen.
You need to remember them somewhere, like in a new list:
result=[] #somewhere at the beginning
...
print("Domain " + y + " exists")
result.append([y,"Domain " + y + " exists"]) #after each print
and save both in the CSV file with something like:
# Iterate the collected [domain, status] pairs in `result`; `new_domains`
# only holds the URL strings and cannot be unpacked into two values.
for domain, status in result:
    writer.writerow([domain, status])
It's easier to save the domains again, as the for / in may not keep their order.
By the way, with "for line in new_domains:" I guess you should have written "line" to the CSV instead of "new_domains"...
Question: I'm trying to figure out how to also export the print function that matches with the if else statement into the csv
If you want to print into a file, you have to pass the file object to print(..., file=<my file object>). In your example, move the for ... loop inside the with ... block.
Note: It's not a good idea to use csv.writer(...) for non-CSV data.
with open("test", "w", newline='') as my_file_object:
    for y in new_domains:
        # ... perform the request / status check for y here ...
        # file= sends the text to the open file instead of the screen
        print("Domain " + y + " exists", file=my_file_object)
From Python Documentation - Built-in Functions
print(*objects, sep=' ', end='\n', file=sys.stdout, flush=False)
Print objects to the text stream file, separated by sep and followed by end.
sep, end, file and flush, if present, must be given as keyword arguments.
The file argument must be an object with a write(string) method; if it is not present or None, sys.stdout will be used.

SQL query returns blank output when running inside Python script

I have a Python script that is supposed to loop through a text file and gather the domain as an argument from each line in the text file. Then it is supposed to use the domain as an argument in a SQL query. The issue is that when I pass in domain_name as an argument, the JSON output the script produces is blank. If I set the domain_name argument directly inside the SQL query, then the script outputs perfectly formatted JSON. As you can see at the top of my script, right below def connect_to_db(), I start to loop through the text file. I'm not sure where in my code the error is occurring, but any assistance would be greatly appreciated!
Code
from __future__ import print_function
try:
import psycopg2
except ImportError:
raise ImportError('\n\033[33mpsycopg2 library missing. pip install psycopg2\033[1;m\n')
sys.exit(1)
import re
import sys
import json
import pprint
DB_HOST = 'crt.sh'
DB_NAME = 'certwatch'
DB_USER = 'guest'
def connect_to_db():
filepath = 'test.txt'
with open(filepath) as fp:
for cnt, domain_name in enumerate(fp):
print("Line {}: {}".format(cnt, domain_name))
print(domain_name)
domain_name = domain_name.rstrip()
conn = psycopg2.connect("dbname={0} user={1} host={2}".format(DB_NAME, DB_USER, DB_HOST))
cursor = conn.cursor()
cursor.execute(
"SELECT c.id, x509_commonName(c.certificate), x509_issuerName(c.certificate) FROM certificate c, certificate_identity ci WHERE c.id = ci.certificate_id AND ci.name_type = 'dNSName' AND lower(ci.name_value) = lower('%s') AND x509_notAfter(c.certificate) > statement_timestamp();".format(
domain_name))
unique_domains = cursor.fetchall()
# print out the records using pretty print
# note that the NAMES of the columns are not shown, instead just indexes.
# for most people this isn't very useful so we'll show you how to return
# columns as a dictionary (hash) in the next example.
pprint.pprint(unique_domains)
outfilepath = domain_name + ".json"
with open(outfilepath, 'a') as outfile:
outfile.write(json.dumps(unique_domains, sort_keys=True, indent=4))
if __name__ == "__main__":
connect_to_db()
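Don't use format to create your SQL statement. Use placeholders in the query and pass a tuple of the values to insert as the second argument (psycopg2 uses %s as its placeholder; the ? style belongs to drivers such as sqlite3):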
# psycopg2 expects %s placeholders (unquoted); the driver quotes the value.
c.execute('''SELECT c.id, x509_commonName(c.certificate),
x509_issuerName(c.certificate) FROM certificate c, certificate_identity ci WHERE
c.id= ci.certificate_id AND ci.name_type = 'dNSName' AND lower(ci.name_value) =
lower(%s) AND x509_notAfter(c.certificate) > statement_timestamp()''',(domain_name,))
More generically:
# The query text and the parameter tuple are two separate arguments, so they
# must be separated by a comma; %s is psycopg2's placeholder style.
c.execute('''SELECT columnX FROM tableA where columnY = %s AND columnZ = %s''',
          (desired_columnY_value, desired_columnZ_value))

Python script doesn't work when passing in a argument from a text file

I have a Python script that is supposed to loop through a text file and gather the domain as an argument from each line in the text file. Then it is supposed to use the domain as an argument in a SQL query. The issue is that when I pass in the domain as an argument, the JSON output the script produces is blank. If I set the domain_name argument directly inside the SQL query, then the script outputs perfectly formatted JSON. As you can see at the top of my script, right below def connect_to_db(), I start to loop through the text file. I'm not sure where in my code the error is occurring, but any assistance would be greatly appreciated!
Code
from __future__ import print_function
try:
import psycopg2
except ImportError:
raise ImportError('\n\033[33mpsycopg2 library missing. pip install psycopg2\033[1;m\n')
sys.exit(1)
import re
import sys
import json
import pprint
DB_HOST = 'crt.sh'
DB_NAME = 'certwatch'
DB_USER = 'guest'
def connect_to_db():
filepath = 'test.txt'
with open(filepath) as fp:
for cnt, domain_name in enumerate(fp):
print("Line {}: {}".format(cnt, domain_name))
print(domain_name)
domain_name = domain_name.rstrip()
conn = psycopg2.connect("dbname={0} user={1} host={2}".format(DB_NAME, DB_USER, DB_HOST))
cursor = conn.cursor()
cursor.execute(
"SELECT c.id, x509_commonName(c.certificate), x509_issuerName(c.certificate) FROM certificate c, certificate_identity ci WHERE c.id = ci.certificate_id AND ci.name_type = 'dNSName' AND lower(ci.name_value) = lower('**domain_name**') AND x509_notAfter(c.certificate) > statement_timestamp();")
unique_domains = cursor.fetchall()
# print out the records using pretty print
# note that the NAMES of the columns are not shown, instead just indexes.
# for most people this isn't very useful so we'll show you how to return
# columns as a dictionary (hash) in the next example.
pprint.pprint(unique_domains)
outfilepath = domain_name + ".json"
with open(outfilepath, 'a') as outfile:
outfile.write(json.dumps(unique_domains, sort_keys=True, indent=4))
if __name__ == "__main__":
# filepath = 'test.txt'
# with open(filepath) as fp:
# for cnt, domain_name in enumerate(fp):
# print("Line {}: {}".format(cnt, domain_name))
# print(domain_name)
# domain_name = domain_name.rstrip()
connect_to_db()

Search and replace specific line which starts with specific string in a file

My requirement is to open a properties file and update it. To do the update, I need to search for a specific line, which stores the URL information, and replace it. For this purpose I have written the code below in Python:
import os
owsURL="https://XXXXXXXXXXXXXX/"
reowsURL = "gStrOwsEnv = " + owsURL + "/" + "OWS_WS_51" + "/"
fileName='C:/Users/XXXXXXXXXXX/tempconf.properties'
if not os.path.isfile(fileName):
print("!!! Message : Configuraiton.properties file is not present ")
else:
print("+++ Message : Located the configuration.properties file")
with open(fileName) as f:
data = f.readlines()
for m in data:
if m.startswith("gStrOwsEnv"):
print("ok11")
m = m.replace(m,reowsURL)
After executing the program, I am not able to update the properties file.
Any help is highly appreciated
Sample Content of file:
# ***********************************************
# Test Environment Details
# ***********************************************
# Application URL pointing to test execution
#gStrApplicationURL =XXXXXXXXXXXXXXXX/webservices/person
#gStrApplicationURL = XXXXXXXXXXXXXX/GuestAPIService/ProxyServices/
# FOR JSON
#gStrApplicationURL = XXXXXXXXXXXXXX
#SOAP_gStrApplicationURL =XXXXXXXXXXXXXXXXXXXXXXX
#(FOR WSDL PARSING)
version = 5
#v9
#SOAP_gStrApplicationURL = XXXXXXXXXXX/XXXXXXXXX/XXXXXXXXX/
#v5
SOAP_gStrApplicationURL = XXXXXXXXXXXXXXX/OWS_WS_51/
gStrApplicationXAIServerPath=
gStrEnvironmentName=XXXXXXXXX
gStrOwsEnv = XXXXXXXXXXXXXXXXXXXX/OWS_WS_51/
gStrConnectEnv = XXXXXXXXXXXXXXXXX/OWSServices/Proxy/
gStrSubscriptionKey =XXXXXXXXXXXXXXXXXXXXXX
I'm pretty sure that this is not the best way of doing that, but this is still one way:
with open(input_file_name, 'r') as f_in, open(output_file_name, 'w') as f_out:
    for line in f_in:
        if line.startswith("gStrOwsEnv"):
            # reowsURL has no line terminator of its own; add one so the
            # following property is not glued onto this line.
            f_out.write(reowsURL + "\n")
        else:
            f_out.write(line)
That script copies every line of input_file_name into output_file_name, except for the lines that you want to change, which it replaces.

Script that reads PDF metadata and writes to CSV

I wrote a script to read PDF metadata to ease a task at work. The current working version is not very usable in the long run:
from pyPdf import PdfFileReader
BASEDIR = ''
PDFFiles = []
def extractor():
output = open('windoutput.txt', 'r+')
for file in PDFFiles:
try:
pdf_toread = PdfFileReader(open(BASEDIR + file, 'r'))
pdf_info = pdf_toread.getDocumentInfo()
#print str(pdf_info) #print full metadata if you want
x = file + "~" + pdf_info['/Title'] + " ~ " + pdf_info['/Subject']
print x
output.write(x + '\n')
except:
x = file + '~' + ' ERROR: Data missing or corrupt'
print x
output.write(x + '\n')
pass
output.close()
if __name__ == "__main__":
extractor()
Currently, as you can see, I have to manually input the working directory and manually populate the list of PDF files. It also just prints out the data in the terminal in a format that I can copy/paste/separate into a spreadsheet.
I'd like the script to work automatically in whichever directory I throw it in and populate a CSV file for easier use. So far:
from pyPdf import PdfFileReader
import csv
import os
def extractor():
basedir = os.getcwd()
extension = '.pdf'
pdffiles = [filter(lambda x: x.endswith('.pdf'), os.listdir(basedir))]
with open('pdfmetadata.csv', 'wb') as csvfile:
for f in pdffiles:
try:
pdf_to_read = PdfFileReader(open(f, 'r'))
pdf_info = pdf_to_read.getDocumentInfo()
title = pdf_info['/Title']
subject = pdf_info['/Subject']
csvfile.writerow([file, title, subject])
print 'Metadata for %s written successfully.' % (f)
except:
print 'ERROR reading file %s.' % (f)
#output.writerow(x + '\n')
pass
if __name__ == "__main__":
extractor()
In its current state it seems to just print a single error message (as in, the error message from the except branch, not an error raised by Python) and then stop. I've been staring at it for a while and I'm not really sure where to go from here. Can anyone point me in the right direction?
writerow([file, title, subject]) should be writerow([f, title, subject])
You can use sys.exc_info() to print the details of your error
http://docs.python.org/2/library/sys.html#sys.exc_info
Did you check the pdffiles variable contains what you think it does? I was getting a list inside a list... so maybe try:
for files in pdffiles:
for f in files:
#do stuff with f
I personally like glob. Notice I add * before the .pdf in the extension variable:
import os
import glob
basedir = os.getcwd()
# glob needs the wildcard in the pattern, hence '*.pdf' rather than '.pdf'
extension = '*.pdf'
pdffiles = glob.glob(os.path.join(basedir, extension))
Figured it out. The script I used to download the files was saving the files with '\r\n' trailing after the file name, which I didn't notice until I actually ls'd the directory to see what was up. Thanks for everyone's help.

Categories