How to export dict list to csv cleanly? - python

I have a script I'm writing to make pulling data from my fantasy football league easy and to export it in a format that can easily be worked with in Excel.
The script I have attached only contains the parts relevant to this question, as the larger script I have written has a lot of moving parts that don't apply here.
I'm essentially pulling this players.get_all_players() data from the Sleeper platform using the Sleeper-API-Wrapper (Github link here).
My script will take player data and put it into a .csv like this, with the player ID in the top row and all the info in a single cell below the ID. Screenshot of this below.
Excel .csv screenshot
How can I export this so that the data is nicely formatted into separate rows? I have a different spreadsheet that I'd like to pull this data into automatically.
Alternatively, if I'm doing this in a really roundabout way, please let me know! This is the JSON response from the platform: JSON Response
# 9 All players - players.get_all_players()
warning = 1
while warning == 1:
    print("%s%s\n\n\nWARNING:%s" % (fg(15), bg(9), attr(0)))
    print("%s%sthe 'all players' option is intensive and may freeze your PC for several minutes.%s" % (fg(15), bg(0), attr(1)))
    warning = input("continue anyway? (y/n)\n")
    if warning == "n":
        pe_loop = 0
        action = 0
    elif warning == "y":
        name = "all players"; file = name
        output = players.get_all_players()
        break
    else:
        print("Not a valid option, try again.")
        warning = 1

overwrite = 0
name_change = 0
while action == 0:
    try:
        action = int(input("%s%s\n1 - print\n2 - export to Excel\n3 - back to tasks\n4 - end\n--------------------\n%s" % (fg(14), bg(0), attr(1))))
    except ValueError:
        print("Not a valid option, try again.")
    ## Print
    if action == 1 and week != 18:
        print(output)
        break
    elif action == 1 and week == 18:
        week = 0
        while week < 18:
            week += 1
            if task == 3:
                output = league.get_matchups(week)
            elif task == 4:
                output = league.get_transactions(week)
            print(output)
    ## Export
    elif action == 2:
        path = os.path.join(parent_dir, file)
        name_change = input("\nDo you want to name the file? (y/n)\n")
        if name_change == "y":
            name = input("\nEnter file name now:\n")
        if name_change == "n":
            file_path = path + "\\" + name + '_' + str(year) + ".xlsx"
            if os.path.isfile(file_path) == True:
                overwrite = input("\nFile name... '" + name + "' already exists! Would you like to overwrite this file? (y/n)\n")
                if overwrite == "n":
                    count = 0
                    while os.path.isfile(file_path) == True:
                        count += 1
                        new_name = name + "_" + str(count)
                        file_path = path + "\\" + new_name + ".xlsx"
                    else:
                        name = new_name
                        print("\nThe new file was automatically named: " + new_name + "_wk" + str(week) + "\nand placed in: " + path)
        if os.path.isdir(path) == False and overwrite == 0:
            os.mkdir(path)
            print("\nCreating new file path... " + file + "\n")
        elif os.path.isdir(path) == True and overwrite == 0:
            print("\nFile path... '" + file + "' exists!\n")
        toCSV = output
        # 9 All Players CSV file
        with open(parent_dir + file + "\\" + name + ".csv", 'w', encoding='utf8', newline='') as output_file:
            fc = csv.DictWriter(output_file, output.keys())
            fc.writeheader()
            fc.writerow(toCSV)

It turns out that sleeper_wrapper exposes a method players.get_players_df that gives you a pandas DataFrame containing all players.
Write that to a csv file using to_csv as suggested in the comments.
Strip down your code to receive better answers faster :)
This is the code that your question needs:
from sleeper_wrapper import Players
import csv

players = Players()
toCSV = players.get_all_players()
with open(parent_dir + file + "\\" + name + ".csv", 'w', encoding='utf8', newline='') as output_file:
    fc = csv.DictWriter(output_file, toCSV.keys())
    fc.writeheader()
    fc.writerow(toCSV)
This is how you write the csv using pandas:
import pandas as pd
from sleeper_wrapper import Players
players = Players()
all_players = players.get_all_players()
# stolen from https://github.com/NotTheCrocHunter/sleeper-api-wrapper/blob/91d8cf1b64cf55884b4c4746d53ccd1259d11c1f/sleeper_wrapper/players.py#L41
# because that method is unavailable in the version of sleeper_wrapper in PyPI
all_players_df = pd.DataFrame.from_dict(all_players, orient="index")
# all_players_df contains some information on teams as well, maybe you want to filter that out...
all_players_df.to_csv("your_output_file.csv")
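If you only want the player rows in the CSV, one possible filter is below. This is a sketch of my own, not part of the wrapper's API: it assumes Sleeper keys individual players by numeric ID strings while team defense entries use alphabetic keys like "SF", so keeping only numeric index values should drop the team rows.

# Sketch: keep only rows whose key is a numeric player ID (assumes team
# entries such as "SF" use alphabetic keys, as the Sleeper data suggests).
player_rows = all_players_df[all_players_df.index.astype(str).str.isdigit()]
player_rows.to_csv("players_only.csv")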

Related

Error while trying to generate xml file with specific name

I tried to generate a file named, for example, 23-10-2022|21-03-11.xml, or, if the user enters his own name, userGeneratedName23-10-2022-21-03-11.xml. I don't know why, when I try to specify a particular folder where the generated file should be saved, the program throws an Invalid argument error. I suspect that I am using join incorrectly, but I don't really know how to correct it.
if not os.path.exists("Generated XMLs"):
    os.makedirs("Generated XMLs")
#open file
today = date.today()
now = datetime.now()
#if filename is not specified, create file with today's date and time of creation
if filename == "":
    filename = today.strftime("%d-%m-%Y") + "|" + now.strftime("%H-%M-%S")
#if filename is specified, add today's date and time of creation at the end of it
else:
    filename = filename + today.strftime("%d-%m-%Y") + "|" + now.strftime("%H-%M-%S")
# open file which is in the Generated XMLs folder and name it with variable filename
file = open(os.path.join("Generated XMLs", filename + ".xml"), "w")
#write to file
#write header
file.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<root>\n")
file.write("\t<POZYCJE>\n")
#write data
# ask user if he wants netto or brutto meters and add flag
flag = input("Enter 'n' if you want to use netto meters or 'b' if you want to use brutto meters: ")
for i in range(len(convertedData)):
    file.write("\t\t<POZYCJA>\n")
    file.write("\t\t\t<LP>" + str(i+1) + "</LP>\n")
    file.write("\t\t\t<TOWAR>\n")
    file.write("\t\t\t\t<KOD>" + convertedData[i][0] + " " + convertedData[i][1] + "</KOD>\n")
    file.write("\t\t\t\t<NAZWA>" + convertedData[i][0] + " " + convertedData[i][1] + "</NAZWA>\n")
    file.write("\t\t\t\t<OPIS/>")
    file.write("\t\t\t\t<EAN/>")
    file.write("\t\t\t\t<SWW/>")
    file.write("\t\t\t\t<NUMER_KATALOGOWY/>")
    file.write("\t\t\t\t<MPP>" + "0" + "</MPP>\n")
    file.write("\t\t\t</TOWAR>\n")
    file.write("\t\t\t<STAWKA_VAT>\n")
    file.write("\t\t\t\t<STAWKA>" + "23.00" + "</STAWKA>\n")
    file.write("\t\t\t\t<FLAGA>" + "2" + "</FLAGA>\n")
    file.write("\t\t\t\t<ZRODLOWA>" + "0.00" + "</ZRODLOWA>\n")
    file.write("\t\t\t</STAWKA_VAT>\n")
    file.write("\t\t\t<CENY>\n")
    file.write("\t\t\t\t<CENAZCZTEREMAMIEJSCAMI>0</CENAZCZTEREMAMIEJSCAMI>\n")
    file.write("\t\t\t\t<POCZATKOWA_WAL_CENNIKA>00.0000</POCZATKOWA_WAL_CENNIKA>\n")
    file.write("\t\t\t\t<POCZATKOWA_WAL_DOKUMENTU>00.0000</POCZATKOWA_WAL_DOKUMENTU>\n")
    file.write("\t\t\t\t<PO_RABACIE_WAL_CENNIKA>00.0000</PO_RABACIE_WAL_CENNIKA>\n")
    file.write("\t\t\t\t<PO_RABACIE_PLN>00.0000</PO_RABACIE_PLN>\n")
    file.write("\t\t\t\t<PO_RABACIE_WAL_DOKUMENTU>00.0000</PO_RABACIE_WAL_DOKUMENTU>\n")
    file.write("\t\t\t</CENY>\n")
    file.write("\t\t\t<WALUTA>\n")
    file.write("\t\t\t\t<SYMBOL>PLN</SYMBOL>\n")
    file.write("\t\t\t\t<KURS_L>1.00</KURS_L>\n")
    file.write("\t\t\t\t<KURS_M>1</KURS_M>\n")
    file.write("\t\t\t</WALUTA>\n")
    file.write("\t\t\t<RABAT>0.00</RABAT>\n")
    file.write("\t\t\t<WARTOSC_NETTO>0.00</WARTOSC_NETTO>\n")
    file.write("\t\t\t<WARTOSC_BRUTTO>0.00</WARTOSC_BRUTTO>\n")
    file.write("\t\t\t<WARTOSC_NETTO_WAL>00.00</WARTOSC_NETTO_WAL>\n")
    file.write("\t\t\t<WARTOSC_BRUTTO_WAL>833.94</WARTOSC_BRUTTO_WAL>\n")
    if flag == "n":
        file.write("\t\t\t<ILOSC>" + convertedData[i][3] + "00" + "</ilosc>\n")
    elif flag == "b":
        file.write("\t\t\t<ILOSC>" + convertedData[i][2] + "00" + "</ilosc>\n")
    else:
        print("Error: Wrong flag. Enter 'n' or 'b'.")
    file.write("\t\t\t<JB>" + convertedData[i][4] + "</JB>\n")
    file.write("\t\t\t<JM_CALKOWITE>0.00</JM_CALKOWITE>\n")
    file.write("\t\t\t<JM_ZLOZONA>\n")
    file.write("\t\t\t\t\n")
    file.write("\t\t\t\t\n")
    file.write("\t\t\t\t\n")
    file.write("\t\t\t</JM_ZLOZONA>\n")
    file.write("\t\t\t<JMZ>" + convertedData[i][4] + "</JMZ>\n")
    file.write("\t\t\t<JM_PRZELICZNIK_L>1.00</JM_PRZELICZNIK_L>\n")
    file.write("\t\t\t<JM_PRZELICZNIK_M>1</JM_PRZELICZNIK_M>\n")
    file.write("\t\t</POZYCJA>\n")
#write footer
file.write("\t</POZYCJE>\n")
file.write("</root>")
#close file
file.close()
The exact error I get:
Traceback (most recent call last):
  File "C:\Users\reczul\PycharmProjects\pythonProject5\main.py", line 23, in <module>
    main()
  File "C:\Users\reczul\PycharmProjects\pythonProject5\main.py", line 13, in main
    XMLCreator.CreateXML(DataConverter.ConvertData(data))
  File "C:\Users\reczul\PycharmProjects\pythonProject5\venv\Functions\XMLCreator.py", line 20, in CreateXML
    file = open(os.path.join("Generated XMLs", filename + ".xml"), "w")
OSError: [Errno 22] Invalid argument: 'Generated XMLs\\24-10-2022|09-23-45.xml'
Process finished with exit code 1
It turns out I cannot use "|" in a Windows filename; I also had to convert filename to str.
if not os.path.exists("Generated XMLs"):
    os.makedirs("Generated XMLs")
#open file
today = date.today()
now = datetime.now()
#if filename is not specified, create file with today's date and time of creation
if filename == "":
    filename = today.strftime("%d-%m-%Y") + "--" + now.strftime("%H-%M-%S")
#if filename is specified, add today's date and time of creation at the end of it
else:
    filename = filename + today.strftime("%d-%m-%Y") + "--" + now.strftime("%H-%M-%S")
filename = str(filename)
# open the file which is in the Generated XMLs folder and name it with the variable filename
#file = open(os.path.join(os.path.dirname(os.path.abspath(__file__)),"Generated XMLs",filename + ".xml"), "w")
#create XML file which is in the Generated XMLs folder
file = open(os.path.join("Generated XMLs", filename + ".xml"), "w")
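As a general guard against this class of error, a small sanitizing helper can strip every character Windows forbids in filenames before the path is built. This is a sketch of my own, not part of the original code, and the replacement character is arbitrary:

import re

# Replace the characters Windows forbids in filenames (\ / : * ? " < > |)
# with a dash before building the path.
def sanitize_filename(name):
    return re.sub(r'[\\/:*?"<>|]', '-', name)

filename = sanitize_filename(filename)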

Python - Pandas writing blank files to file [closed]

I have a python script that writes to several file formats via Pandas. It can write to CSV/JSON/HTML/Excel.
However, for some reason the script is writing blank files: when I open one of them, there is no data in it.
Before writing the file I print the dataframe to the screen so I can validate that the data is there. For example, with CSV the output to the screen is this:
CSV data: ,AWS Account,Account Number,Name,Instance ID,AMI ID,Volumes,Private IP,Public IP,Private DNS,Availability Zone,VPC ID,Type,Key Pair Name,State,Launch Date
0,project-client-lab,123456789101,bastion001,i-xxxxxxxxxxxxxxxxxxxx,ami-xxxxxxxxxxxxxxxxxxx,vol-xxxxxxxxxxxxxxx,10.238.2.166,3.214.15.175,ip-10-238-2-166.ec2.internal,us-east-1a,vpc-xxxxxxxxxxxxxxxxx,t3.small,project-client-int01,running,March 10 2020
1,project-client-lab,123456789101,logicmonitor001,i-xxxxxxxxxxxxxxxxxxxx,ami-xxxxxxxxxxxxxxxxxxx,vol-0xxxxxxxxxxxxxx,10.238.2.52,,ip-10-238-2-52.ec2.internal,us-east-1a,vpc-xxxxxxxxxxxxxxxxx,m5.large,project-client-int01,running,September 02 2019
2,project-client-lab,123456789101,project-cassandra001,i-xxxxxxxxxxxxxxxxxxxx,ami-xxxxxxxxxxxxxxxxxxx,"vol-xxxxxxxxxxxxxxxxxx, vol-xxxxxxxxxxxxxxxxx",10.238.2.221,,ip-10-238-2-221.ec2.internal,us-east-1a,vpc-xxxxxxxxxxxxxxxxx,m5.large,project-client-int01,running,January 14 2020
3,project-client-lab,123456789101,project-cassandra003,i-xxxxxxxxxxxxxxxxxxxx,ami-xxxxxxxxxxxxxxxxxxx,"vol-xxxxxxxxxxxxxxxxxx, vol-xxxxxxxxxxxxxxxxx",10.238.2.207,,ip-10-238-2-207.ec2.internal,us-east-1a,vpc-xxxxxxxxxxxxxxxxx,m5.large,project-client-int01,running,January 14 2020
4,project-client-lab,123456789101,project-cassandra003,i-xxxxxxxxxxxxxxxxxxxx,ami-xxxxxxxxxxxxxxxxxxx,"vol-xxxxxxxxxxxxxxxxxx, vol-xxxxxxxxxxxxxxxxx",10.238.2.203,,ip-10-238-2-203.ec2.internal,us-east-1a,vpc-xxxxxxxxxxxxxxxxx,c5.xlarge,project-client-int01,running,January 22 2020
5,project-client-lab,123456789101,project-cassandra001,i-xxxxxxxxxxxxxxxxxxxx,ami-xxxxxxxxxxxxxxxxxxx,"vol-xxxxxxxxxxxxxxxxxx, vol-xxxxxxxxxxxxxxxxx",10.238.2.209,,ip-10-238-2-209.ec2.internal,us-east-1a,vpc-xxxxxxxxxxxxxxxxx,c5.xlarge,project-client-int01,running,January 22 2020
6,project-client-lab,123456789101,haproxy001,i-xxxxxxxxxxxxxxxxx,ami-xxxxxxxxxxxxxxxxxxx,vol-xxxxxxxxxxxxxxxxxx,10.238.2.169,54.242.118.165,ip-10-238-2-169.ec2.internal,us-east-1a,vpc-xxxxxxxxxxxxxxxxx,m5.large,project-client-int01,running,February 20 2020
7,project-client-lab,123456789101,logicmonitor002,i-xxxxxxxxxxxxxxx,ami-xxxxxxxxxxxxxxxxxxx,vol-0c48ff6ebb031008a,10.238.2.69,,ip-10-238-2-69.ec2.internal,us-east-1b,vpc-xxxxxxxxxxxxxxxxx,m5.large,project-client-int01,running,September 13 2019
These are the functions that write to file:
def mongo_export_to_file(interactive, aws_account, aws_account_number, instance_col=None, date=None):
    create_directories()
    if date == None:
        format = "%m-%d-%Y"
        today = datetime.today()
        today = today.strftime(format)
        date = today
    else:
        format = "%m-%d-%Y"
        date = datetime.strptime(date, "%m%d%Y")
        date = date.strftime(format)
    if not instance_col:
        _, _, instance_col = set_db()
    # make an API call to the MongoDB server
    if interactive == 0:
        mongo_docs = instance_col.find({})
    else:
        mongo_docs = instance_col.find({"Account Number": aws_account_number})
    # Convert the mongo docs to a DataFrame
    docs = pandas.DataFrame(mongo_docs)
    # Discard the Mongo ID for the documents
    docs.pop("_id")
    if __name__ == "__main__":
        print("Choose a file format")
        print("1. CSV")
        print("2. JSON")
        print("3. HTML")
        print("4. Excel")
        choice = input("Enter a number 1-4: ")
        choice = int(choice)
    else:
        choice = 1
    if choice == 1:
        if __name__ == "__main__":
            # export MongoDB documents to CSV
            csv_export = docs.to_csv(sep=",")  # CSV delimited by commas
            print("\nCSV data:", csv_export)
        # Set the CSV output directory
        output_dir = os.path.join("..", "..", "output_files", "aws_instance_list", "csv", "")
        if interactive == 1:
            output_file = os.path.join(output_dir, "aws-instance-list-" + aws_account + "-" + date + ".csv")
        else:
            output_file = os.path.join(output_dir, "aws-instance-master-list-" + date + ".csv")
        # export MongoDB documents to a CSV file, leaving out the row "labels" (row numbers)
        docs.to_csv(output_file, ",", index=False)  # CSV delimited by commas
    elif choice == 2:
        if __name__ == "__main__":
            json_export = docs.to_json()  # return JSON data
            print("\nJSON data:", json_export)
        # Set the JSON output directory
        output_dir = os.path.join("..", "..", "output_files", "aws_instance_list", "json", "")
        if interactive == 1:
            output_file = os.path.join(output_dir, "aws-instance-list-" + aws_account + "-" + date + ".json")
        else:
            output_file = os.path.join(output_dir, "aws-instance-master-list-" + date + ".json")
        # export MongoDB documents to a JSON file
        docs.to_json(output_file)
    elif choice == 3:
        html_str = io.StringIO()
        # export as HTML
        docs.to_html(
            buf=html_str,
            classes="table table-striped"
        )
        if __name__ == "__main__":
            # print out the HTML table
            print(html_str.getvalue())
        # Set the HTML output directory
        output_dir = os.path.join("..", "..", "output_files", "aws_instance_list", "html", "")
        if interactive == 1:
            output_file = os.path.join(output_dir, "aws-instance-list-" + aws_account + "-" + date + ".html")
        else:
            output_file = os.path.join(output_dir, "aws-instance-master-list-" + date + ".html")
        # save the MongoDB documents as an HTML table
        docs.to_html(output_file)
    elif choice == 4:
        # Set the Excel output directory
        output_dir = os.path.join("..", "..", "output_files", "aws_instance_list", "excel", "")
        time.sleep(5)
        if interactive == 1:
            output_file = os.path.join(output_dir, "aws-instance-list-" + aws_account + "-" + date + ".xlsx")
        else:
            output_file = os.path.join(output_dir, "aws-instance-master-list-" + date + ".xlsx")
        # export MongoDB documents to an Excel file, leaving out the row "labels" (row numbers)
        writer = ExcelWriter(output_file)
        docs.to_excel(writer, "EC2 List", index=False)
        writer.save()
        writer.close()
    if __name__ == "__main__":
        exit = input("Exit program (y/n): ")
        if exit.lower() == "y" or exit.lower() == "yes":
            exit_program()
        else:
            main()

def print_reports(interactive, aws_account, aws_account_number):
    set_db(instance_col=None)
    inputDate = input("Enter the date in format 'dd/mm/yyyy': ")
    day, month, year = inputDate.split('/')
    isValidDate = True
    try:
        datetime(int(year), int(month), int(day))
    except ValueError:
        isValidDate = False
        print_reports(interactive, aws_account, aws_account_number)
    if (isValidDate):
        print(f"Input date is valid: {inputDate}")
        format = "%m%d%Y"
        inputDate = datetime.strptime(inputDate, "%m/%d/%Y")
        inputDate = inputDate.strftime(format)
    else:
        print(f"Input date is not valid: {inputDate}")
        print_reports(interactive, aws_account, aws_account_number)
    myclient = connect_db()
    mydb = myclient["aws_inventories"]
    instance_col = "ec2_list_" + inputDate
    instance_col = mydb[instance_col]
    mongo_export_to_file(interactive, aws_account, aws_account_number, instance_col, date=inputDate)
This is all my code in this script.
Why is this happening, and how do I correct it?
You can view the file in Excel by:
Opening Excel
Going to the "Data" tab
In the "Get & Transform Data" section, click "From Text/CSV"

How to read .evtx file using python?

Does anyone know how to read the event log files in C:\Windows\System32\winevt\Logs with the .evtx extension?
I have already tried to open one with Notepad and to read it from Python, but Notepad says access is denied...
Does anyone know how to do it? Thanks in advance.
This is how you would read the file "Forwarded Events" from the event viewer. You need admin access, so I would run it as admin; it will prompt you for a password if you don't.
import win32evtlog
import xml.etree.ElementTree as ET
import ctypes
import sys

def is_admin():
    try:
        return ctypes.windll.shell32.IsUserAnAdmin()
    except:
        return False

if is_admin():
    # open event file
    query_handle = win32evtlog.EvtQuery(
        r'C:\Windows\System32\winevt\Logs\ForwardedEvents.evtx',
        win32evtlog.EvtQueryFilePath)
    read_count = 0
    while True:
        # read 1 record(s)
        events = win32evtlog.EvtNext(query_handle, 1)
        read_count += len(events)
        # if there is no record break the loop
        if len(events) == 0:
            break
        for event in events:
            xml_content = win32evtlog.EvtRender(event, win32evtlog.EvtRenderEventXml)
            # parse xml content
            xml = ET.fromstring(xml_content)
            # xml namespace, root element has a xmlns definition, so we have to use the namespace
            ns = '{http://schemas.microsoft.com/win/2004/08/events/event}'
            substatus = xml[1][9].text
            event_id = xml.find(f'.//{ns}EventID').text
            computer = xml.find(f'.//{ns}Computer').text
            channel = xml.find(f'.//{ns}Channel').text
            execution = xml.find(f'.//{ns}Execution')
            process_id = execution.get('ProcessID')
            thread_id = execution.get('ThreadID')
            time_created = xml.find(f'.//{ns}TimeCreated').get('SystemTime')
            #data_name = xml.findall('.//EventData')
            #substatus = data_name.get('Data')
            #print(substatus)
            event_data = f'Time: {time_created}, Computer: {computer}, Substatus: {substatus}, Event Id: {event_id}, Channel: {channel}, Process Id: {process_id}, Thread Id: {thread_id}'
            print(event_data)
            user_data = xml.find(f'.//{ns}UserData')
            # user_data may contain any data
else:
    ctypes.windll.shell32.ShellExecuteW(None, "runas", sys.executable, " ".join(sys.argv), None, 1)
input()
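As a small variation on the snippet above (a sketch; the channel name is just an example), win32evtlog can also query a live channel instead of an exported file by passing EvtQueryChannelPath:

import win32evtlog

# Query the live "System" channel rather than a saved .evtx file.
query_handle = win32evtlog.EvtQuery('System', win32evtlog.EvtQueryChannelPath)
events = win32evtlog.EvtNext(query_handle, 10)  # read up to 10 records
for event in events:
    print(win32evtlog.EvtRender(event, win32evtlog.EvtRenderEventXml))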
.evtx is the extension for Windows Eventlog files. It contains data in a special binary format designed by Microsoft, so you cannot simply open it in a text editor.
There are open source tools to read .evtx files, and the NXLog EE can also read them. (Disclaimer: I'm affiliated with the latter.)
I modified the accepted answer a bit as follows, so it becomes reusable:
import xml.etree.ElementTree as Et
import win32evtlog
from collections import namedtuple

class EventLogParser:
    def __init__(self, exported_log_file):
        self.exported_log_file = exported_log_file

    def get_all_events(self):
        windows_events = []
        query_handle = win32evtlog.EvtQuery(str(self.exported_log_file),
                                            win32evtlog.EvtQueryFilePath | win32evtlog.EvtQueryReverseDirection)
        while True:
            raw_event_collection = win32evtlog.EvtNext(query_handle, 1)
            if len(raw_event_collection) == 0:
                break
            for raw_event in raw_event_collection:
                windows_events.append(self.parse_raw_event(raw_event))
        return windows_events

    def parse_raw_event(self, raw_event):
        xml_content = win32evtlog.EvtRender(raw_event, win32evtlog.EvtRenderEventXml)
        root = Et.fromstring(xml_content)
        ns = "{" + root.tag.split('}')[0].strip('{') + "}"
        system = root.find(f'{ns}System')
        event_id = system.find(f'{ns}EventID').text
        level = system.find(f'{ns}Level').text
        time_created = system.find(f'{ns}TimeCreated').get('SystemTime')
        computer = system.find(f'{ns}Computer').text
        WindowsEvent = namedtuple('WindowsEvent',
                                  'event_id, level, time_created, computer')
        return WindowsEvent(event_id, level, time_created, computer)
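Usage would look something like this (a sketch; the file path is hypothetical):

# Parse an exported log and print a summary line per event.
parser = EventLogParser(r"C:\Temp\ForwardedEvents.evtx")
for event in parser.get_all_events():
    print(event.time_created, event.computer, event.event_id, event.level)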
I use the "python-evtx" library; you can install it using this command:
pip install python-evtx
In my case, I'm not interested in reading records with the "Information" level.
import os
import codecs
from datetime import datetime
from lxml import etree
import Evtx.Evtx as evtx

def evtxFile(absolutePath, filenameWithExt, ext, _fromDate, _toDate):
    print("Reading: " + filenameWithExt)
    outText = ""
    channel = ""
    #read the windows event viewer log and convert its contents to XML
    with codecs.open(tempFilePath, "a+", "utf-8", "ignore") as tempFile:
        with evtx.Evtx(absolutePath) as log:
            for record in log.records():
                xmlLine = record.xml()
                xmlLine = xmlLine.replace(" xmlns=\"http://schemas.microsoft.com/win/2004/08/events/event\"", "")
                xmlParse = etree.XML(xmlLine)
                level = parseXMLtoString(xmlParse, ".//Level/text()")
                if not level == "0" and not level == "4":
                    providerName = parseXMLtoString(xmlParse, ".//Provider/@Name")
                    qualifiers = parseXMLtoString(xmlParse, ".//EventID/@Qualifiers")
                    timestamp = parseXMLtoString(xmlParse, ".//TimeCreated/@SystemTime")
                    eventID = parseXMLtoString(xmlParse, ".//EventID/text()")
                    task = parseXMLtoString(xmlParse, ".//Task/text()")
                    keywords = parseXMLtoString(xmlParse, ".//Keywords/text()")
                    eventRecordID = parseXMLtoString(xmlParse, ".//EventRecordID/text()")
                    channel = parseXMLtoString(xmlParse, ".//Channel/text()")
                    computer = parseXMLtoString(xmlParse, ".//Computer/text()")
                    message = parseXMLtoString(xmlParse, ".//Data/text()")
                    if level == "1":
                        level = "Critical"
                    elif level == "2":
                        level = "Error"
                    elif level == "3":
                        level = "Warning"
                    date = timestamp[0:10]
                    time = timestamp[11:19]
                    time = time.replace(".", "")
                    _date = datetime.strptime(date, "%Y-%m-%d").date()
                    if _fromDate <= _date <= _toDate:
                        message = message.replace("<string>", "")
                        message = message.replace("</string>", "")
                        message = message.replace("\r\n", " ")
                        message = message.replace("\n\r", " ")
                        message = message.replace("\n", " ")
                        message = message.replace("\r", " ")
                        outText = date + " " + time + "|" + level + "|" + message.strip() + "|" + task + "|" + computer + "|" + providerName + "|" + qualifiers + "|" + eventID + "|" + eventRecordID + "|" + keywords + "\n"
                        tempFile.writelines(outText)
    with codecs.open(tempFilePath, "r", "utf-8", "ignore") as tempFile2:
        myLinesFromDateRange = tempFile2.readlines()
    #delete the temporary file that was created
    os.remove(tempFilePath)
    if len(myLinesFromDateRange) > 0:
        createFolder("\\filtered_data_files\\")
        outFilename = "windows_" + channel.lower() + "_event_viewer_logs" + ext
        myLinesFromDateRange.sort()
        #remove duplicate records from the list
        myFinalLinesFromDateRange = list(set(myLinesFromDateRange))
        myFinalLinesFromDateRange.sort()
        with codecs.open(os.getcwd() + "\\filtered_data_files\\" + outFilename, "a+", "utf-8", "ignore") as linesFromDateRange:
            linesFromDateRange.seek(0)
            if len(linesFromDateRange.read(100)) > 0:
                linesFromDateRange.writelines("\n")
            linesFromDateRange.writelines(myFinalLinesFromDateRange)
        del myLinesFromDateRange[:]
        del myFinalLinesFromDateRange[:]
    else:
        print("No data was found within the specified date range.")
    print("Closing: " + filenameWithExt)
I hope it helps you or someone else in the future.
EDIT:
The "tempFilePath" can be anything you want, for example:
tempFilePath = os.getcwd() + "\\tempFile.txt"
I collected some information first before calling the "evtxFile" function:
The "From" and the "To" dates are in the following format: YYYY-MM-DD
Converted the dates to "date" data type:
_fromDate = datetime.strptime(fromDate, "%Y-%m-%d").date()
_toDate = datetime.strptime(toDate, "%Y-%m-%d").date()
Divided the directory where the .evtx files are located into different parts:
def splitDirectory(root, file):
    absolutePathOfFile = os.path.join(root, file)
    filePathWithoutFilename = os.path.split(absolutePathOfFile)[0]
    filenameWithExt = os.path.split(absolutePathOfFile)[1]
    filenameWithoutExt = os.path.splitext(filenameWithExt)[0]
    extension = os.path.splitext(filenameWithExt)[1]
    return absolutePathOfFile, filePathWithoutFilename, filenameWithExt, filenameWithoutExt, extension

for root, subFolders, files in os.walk(directoryPath):
    for f in files:
        absolutePathOfFile, filePathWithoutFilename, filenameWithExt, \
            filenameWithoutExt, extension = splitDirectory(root, f)
        if extension == ".evtx":
            evtxFile(absolutePathOfFile, filenameWithExt, ".txt", _fromDate, _toDate)
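For anyone who just needs the raw records, a minimal python-evtx loop (a sketch, assuming the package is installed and the path is readable; the file path is just an example) looks like this:

import Evtx.Evtx as evtx

# Print every record of an exported .evtx file as XML.
with evtx.Evtx(r"C:\Temp\System.evtx") as log:
    for record in log.records():
        print(record.xml())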

Alternative to Bio.Entrez EFetch for downloading full genome sequences from NCBI

My goal is to download full metazoan genome sequences from NCBI. I have a list of unique ID numbers for the genome sequences I need. I planned to use the Bio.Entrez module EFetch to download the data but learned today via the Nov 2, 2011 release notes (http://1.usa.gov/1TA5osg) that EFetch does not support the 'Genome' database. Can anyone suggest an alternative package/module or some other way around this? Thank you in advance!
Here is a script for you -- though you may need to tinker with it to make it work. Name the script whatever you prefer, but when you call the script do so as follows:
python name_of_script[with .py extension] your_email_address.
You need to add your email to the end of the call else it will not work. If you have a text file of accession numbers (1/line), then choose option 2. If you choose option 1, it will ask you for items like the name of the organism, strain name, and keywords. Use as many keywords as you would like -- just be certain to separate them by commas. If you go with the first option, NCBI will be searched and will return GI numbers [NOTE: NCBI is phasing out the GI numbers in 9.2016 so this script may not work after this point] which will then be used to snag the accession numbers. Once all the accession numbers are present, a folder is created, and a subfolder is created for each accession number (named as the accession number). In each subfolder, the corresponding fasta AND genbank file will be downloaded. These files will carry the accession number as the file name (e.g. accession_number.fa, accession_number.gb). Edit script to your purposes.
ALSO...Please note the warning (ACHTUNG) portion of the script. Sometimes the rules can be bent...but if you are egregious enough, your IP may be blocked from NCBI. You have been warned.
import os
import os.path
import sys
import re  # regular expressions
from Bio import Entrez
import datetime
import time
import glob

arguments = sys.argv
Entrez.email = arguments[1]  # email
accession_ids = []
print('Select method for obtaining the accession numbers?\n')
action = input('1 -- Input Search Terms\n2 -- Use text file\n')
if action == '1':
    print('\nYou will be asked to enter an organism name, a strain name, and keywords.')
    print('It is not necessary to provide a value to each item (you may just hit [ENTER]), but you must provide at least one item.\n')
    organism = input('Enter the organism you wish to search for (e.g. Escherichia coli [ENTER])\n')
    strain = input('Enter the strain you wish to search for. (e.g., HUSEC2011 [ENTER])\n')
    keywords = input('Enter the keywords separated by a comma (e.g., complete genome, contigs, partial [ENTER])\n')
    search_phrase = ''
    if ',' in keywords:
        keywords = keywords.split(',')
    ncbi_terms = ['organism', 'strain', 'keyword']
    ncbi_values = [organism, strain, keywords]
    for index, n in enumerate(ncbi_values):
        if index == 0 and n != '':
            search_phrase = '(' + n + '[' + ncbi_terms[index] + '])'
        else:
            if n != '' and index != len(ncbi_values)-1:
                search_phrase = search_phrase + ' AND (' + n + '[' + ncbi_terms[index] + '])'
        if index == len(ncbi_values)-1 and n != '' and type(n) is not list:
            search_phrase = search_phrase + ' AND (' + n + '[' + ncbi_terms[index] + '])'
        if index == len(ncbi_values)-1 and n != '' and type(n) is list:
            for name in n:
                name = name.lstrip()
                search_phrase = search_phrase + ' AND (' + name + '[' + ncbi_terms[index] + '])'
    print('Here is the complete search line that will be used: \n\n', search_phrase)
    handle = Entrez.esearch(db='nuccore', term=search_phrase, retmax=1000, rettype='acc', retmode='text')
    result = Entrez.read(handle)
    handle.close()
    #print(result['Count'])
    gi_numbers = result['IdList']
    fetch_handle = Entrez.efetch(db='nucleotide', id=result['IdList'], rettype='acc', retmode='text')
    accession_ids = [id.strip() for id in fetch_handle]
    fetch_handle.close()
if action == '2':  # use this option if you have a file of accession #s
    file_name = input('Enter the name of the file\n')
    with open(file_name, 'r') as input_file:
        lines = input_file.readlines()
        for line in lines:
            line = line.replace('\n', '')
            accession_ids.append(line)
#--------------------------------------------------------------------------------------------------------------
#----------------------------------- Make directory to store files --------------------------------------------
new_path = 'Genbank_Files/'
if not os.path.exists(new_path):
    os.makedirs(new_path)
print('You have ' + str(len(accession_ids)) + ' file(s) to download.')  # print(accession_ids)
ending = '.gb'
files = []
##CHECK IF FILE HAS BEEN DOWNLOADED
for dirpath, dirnames, filenames in os.walk(new_path):
    for filename in [f for f in filenames if f.endswith(ending)]:  # for zipped files
        files.append(os.path.join(dirpath, filename))
for f in files:
    f = f.rsplit('/')[-1]
    f = f.replace('.gb', '')
    if f in accession_ids:
        ind = accession_ids.index(f)
        accession_ids.pop(ind)
print('')
print('You have ' + str(len(accession_ids)) + ' file(s) to download.')
#--------------------------------------------------------------------------
###############################################################################
#---ACHTUNG--ACHTUNG--ACHTUNG--ACHTUNG--ACHTUNG--ACHTUNG--ACHTUNG--ACHTUNG----#
###############################################################################
# Call Entrez to download files
# If downloading more than 100 files...
# Run this script only between 9pm-5am Monday - Friday EST
# Send E-utilities requests to http://eutils.ncbi.nlm.nih.gov
# Make no more than 3 requests every 1 second (Biopython takes care of this).
# Use URL parameter email & tool for distributed software
# NCBI's Disclaimer and Copyright notice must be evident to users of your service.
#
# Use this script at your own risk.
# Neither the script author nor author's employers are responsible for consequences arising from improper usage
###############################################################################
# CALL ENTREZ: Call Entrez to download genbank AND fasta (nucleotide) files using accession numbers.
###############################################################################
start_day = datetime.date.today().weekday()  # 0 is Monday, 6 is Sunday
start_time = datetime.datetime.now().time()
print(str(start_day), str(start_time))
print('')
if ((start_day < 5 and start_time > datetime.time(hour=21)) or
        (start_day < 5 and start_time < datetime.time(hour=5)) or
        start_day > 5 or len(accession_ids) <= 100):
    print('Calling Entrez...')
    for a in accession_ids:
        if ((datetime.date.today().weekday() < 5 and datetime.datetime.now().time() > datetime.time(hour=21)) or
                (datetime.date.today().weekday() < 5 and datetime.datetime.now().time() < datetime.time(hour=5)) or
                (datetime.date.today().weekday() == start_day + 1 and datetime.datetime.now().time() < datetime.time(hour=5)) or
                (datetime.date.today().weekday() > 5) or len(accession_ids) <= 100):
            print('Downloading ' + a)
            new_path = 'Genbank_Files/' + a + '/'
            if not os.path.exists(new_path):
                os.makedirs(new_path)
            handle = Entrez.efetch(db='nucleotide', id=a, rettype='gb', retmode='text', seq_start=0)
            FILENAME = new_path + a + '.gb'
            local_file = open(FILENAME, 'w')
            local_file.write(handle.read())
            handle.close()
            local_file.close()
            handle = Entrez.efetch(db='nucleotide', id=a, rettype='fasta', retmode='text')
            FILENAME = new_path + a + '.fna'
            local_file = open(FILENAME, 'w')
            local_file.write(handle.read())
            handle.close()
            local_file.close()
else:
    print('You have too many files to download at the time. Try again later.')
#-------

Increment file name while writing file in Python

My code works and increments the filename, but only for the first two files; after that it appends new lines of text to the existing second file. Please help me fix the code so the increment keeps going.
text = 'some text'
file_path = '/path/to/file'
filename = 'textfile'
i = 1
txtfile = self.file_path + filename + str(i) + '.txt'
if not os.path.exists(txtfile):
    text_file = open(txtfile, "a")
    text_file.write(self.text)
    text_file.close()
elif os.path.exists(txtfile) and i >= 1:
    i += 1
    text_file1 = open(self.file_path + filename + str(i) + '.txt', "a")
    text_file1.write(self.text)
    text_file1.close()
If your example is part of a loop, you're resetting i to 1 in every iteration. Put the i = 1 outside of that loop; a loop-based sketch is shown below.
Note that i will also start over at 1 when you restart your program, which is sometimes not what you want.
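A minimal sketch of that looping approach (using plain module-level names rather than the self. attributes from the question):

import os

text = 'some text'
file_path = '/path/to/file/'
filename = 'textfile'

# Keep incrementing i until a filename that does not exist yet is found,
# then write the text to that fresh file exactly once.
i = 1
while os.path.exists(file_path + filename + str(i) + '.txt'):
    i += 1
with open(file_path + filename + str(i) + '.txt', "w") as text_file:
    text_file.write(text)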
