Related
I tried to generate a file named, for example, 23-10-2022|21-03-11.xml or if the user enters his own name userGeneratedName23-10-2022-21-03-11.xml. I don't know why when I tried to specify a particular folder where the generated file should be saved the program throws me an Invalid argument error. I suspect that I am using join incorrectly, I don't really know how to correct it
# Ensure the output directory exists before writing into it.
if not os.path.exists("Generated XMLs"):
    os.makedirs("Generated XMLs")

# Capture the current date and time once for the file name.
today = date.today()
now = datetime.now()
# Build the output file name. NOTE: Windows forbids '|' in file names
# (it caused OSError: [Errno 22] Invalid argument), so '--' is used instead.
if filename == "":
    # No name given: use only the creation date and time.
    filename = today.strftime("%d-%m-%Y") + "--" + now.strftime("%H-%M-%S")
else:
    # Name given: append the creation date and time to it.
    filename = filename + today.strftime("%d-%m-%Y") + "--" + now.strftime("%H-%M-%S")

# Ask once whether net ('n') or gross ('b') quantities should be written.
flag = input("Enter 'n' if you want to use netto meters or 'b' if you want to use brutto meters: ")

# 'with' guarantees the file is closed even if an error occurs mid-write.
with open(os.path.join("Generated XMLs", filename + ".xml"), "w") as file:
    # The XML declaration must be the very first bytes of the document;
    # the original wrote "<root>" before it, producing invalid XML.
    file.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n")
    file.write("<root>\n")
    file.write("\t<POZYCJE>\n")
    # One <POZYCJA> element per converted row.
    for i in range(len(convertedData)):
        file.write("\t\t<POZYCJA>\n")
        # str() is required: concatenating the int (i + 1) to a str raises TypeError.
        file.write("\t\t\t<LP>" + str(i + 1) + "</LP>\n")
        file.write("\t\t\t<TOWAR>\n")
        file.write("\t\t\t\t<KOD>" + convertedData[i][0] + " " + convertedData[i][1] + "</KOD>\n")
        file.write("\t\t\t\t<NAZWA>" + convertedData[i][0] + " " + convertedData[i][1] + "</NAZWA>\n")
        file.write("\t\t\t\t<OPIS/>\n")
        file.write("\t\t\t\t<EAN/>\n")
        file.write("\t\t\t\t<SWW/>\n")
        file.write("\t\t\t\t<NUMER_KATALOGOWY/>\n")
        file.write("\t\t\t\t<MPP>" + "0" + "</MPP>\n")
        file.write("\t\t\t</TOWAR>\n")
        file.write("\t\t\t<STAWKA_VAT>\n")
        file.write("\t\t\t\t<STAWKA>" + "23.00" + "</STAWKA>\n")
        file.write("\t\t\t\t<FLAGA>" + "2" + "</FLAGA>\n")
        file.write("\t\t\t\t<ZRODLOWA>" + "0.00" + "</ZRODLOWA>\n")
        file.write("\t\t\t</STAWKA_VAT>\n")
        file.write("\t\t\t<CENY>\n")
        file.write("\t\t\t\t<CENAZCZTEREMAMIEJSCAMI>0</CENAZCZTEREMAMIEJSCAMI>\n")
        file.write("\t\t\t\t<POCZATKOWA_WAL_CENNIKA>00.0000</POCZATKOWA_WAL_CENNIKA>\n")
        file.write("\t\t\t\t<POCZATKOWA_WAL_DOKUMENTU>00.0000</POCZATKOWA_WAL_DOKUMENTU>\n")
        file.write("\t\t\t\t<PO_RABACIE_WAL_CENNIKA>00.0000</PO_RABACIE_WAL_CENNIKA>\n")
        file.write("\t\t\t\t<PO_RABACIE_PLN>00.0000</PO_RABACIE_PLN>\n")
        file.write("\t\t\t\t<PO_RABACIE_WAL_DOKUMENTU>00.0000</PO_RABACIE_WAL_DOKUMENTU>\n")
        file.write("\t\t\t</CENY>\n")
        file.write("\t\t\t<WALUTA>\n")
        file.write("\t\t\t\t<SYMBOL>PLN</SYMBOL>\n")
        file.write("\t\t\t\t<KURS_L>1.00</KURS_L>\n")
        file.write("\t\t\t\t<KURS_M>1</KURS_M>\n")
        file.write("\t\t\t</WALUTA>\n")
        file.write("\t\t\t<RABAT>0.00</RABAT>\n")
        file.write("\t\t\t<WARTOSC_NETTO>0.00</WARTOSC_NETTO>\n")
        file.write("\t\t\t<WARTOSC_BRUTTO>0.00</WARTOSC_BRUTTO>\n")
        file.write("\t\t\t<WARTOSC_NETTO_WAL>00.00</WARTOSC_NETTO_WAL>\n")
        file.write("\t\t\t<WARTOSC_BRUTTO_WAL>833.94</WARTOSC_BRUTTO_WAL>\n")
        # Column 3 holds the net quantity, column 2 the gross quantity.
        # The closing tag must match the opening tag's case: XML is
        # case-sensitive, so '</ilosc>' would be malformed.
        if flag == "n":
            file.write("\t\t\t<ILOSC>" + convertedData[i][3] + "00" + "</ILOSC>\n")
        elif flag == "b":
            file.write("\t\t\t<ILOSC>" + convertedData[i][2] + "00" + "</ILOSC>\n")
        else:
            print("Error: Wrong flag. Enter 'n' or 'b'.")
        file.write("\t\t\t<JB>" + convertedData[i][4] + "</JB>\n")
        file.write("\t\t\t<JM_CALKOWITE>0.00</JM_CALKOWITE>\n")
        file.write("\t\t\t<JM_ZLOZONA>\n")
        file.write("\t\t\t\t\n")
        file.write("\t\t\t\t\n")
        file.write("\t\t\t\t\n")
        file.write("\t\t\t</JM_ZLOZONA>\n")
        file.write("\t\t\t<JMZ>" + convertedData[i][4] + "</JMZ>\n")
        file.write("\t\t\t<JM_PRZELICZNIK_L>1.00</JM_PRZELICZNIK_L>\n")
        file.write("\t\t\t<JM_PRZELICZNIK_M>1</JM_PRZELICZNIK_M>\n")
        file.write("\t\t</POZYCJA>\n")
    # Footer closes the document.
    file.write("\t</POZYCJE>\n")
    file.write("</root>")
The exact error I get:
Traceback (most recent call last):
File "C:\Users\reczul\PycharmProjects\pythonProject5\main.py", line 23, in <module>
main()
File "C:\Users\reczul\PycharmProjects\pythonProject5\main.py", line 13, in main
XMLCreator.CreateXML(DataConverter.ConvertData(data))
File "C:\Users\reczul\PycharmProjects\pythonProject5\venv\Functions\XMLCreator.py", line 20, in CreateXML
file = open(os.path.join("Generated XMLs", filename + ".xml"), "w")
OSError: [Errno 22] Invalid argument: 'Generated XMLs\\24-10-2022|09-23-45.xml'
Process finished with exit code 1
I cannot use "|"; I also had to convert the filename to str.
# Ensure the output directory exists before creating the file in it.
if not os.path.exists("Generated XMLs"):
    os.makedirs("Generated XMLs")
# Capture the current date and time once for the file name.
today = date.today()
now = datetime.now()
# If no file name was given, use only the creation date and time.
# '--' replaces the earlier '|', which Windows does not allow in file names.
if filename == "":
    filename = today.strftime("%d-%m-%Y") +"--"+ now.strftime("%H-%M-%S")
# Otherwise append the creation date and time to the user-supplied name.
else:
    filename = filename + today.strftime("%d-%m-%Y")+"--"+ now.strftime("%H-%M-%S")
# Defensive conversion in case a non-string name was passed in.
filename = str(filename)
# Create the XML file inside the "Generated XMLs" folder, named per the
# variable above.
#file = open(os.path.join(os.path.dirname(os.path.abspath(__file__)),"Generated XMLs",filename + ".xml"), "w")
file = open(os.path.join("Generated XMLs", filename + ".xml"), "w")
Closed. This question needs to be more focused. It is not currently accepting answers.
Want to improve this question? Update the question so it focuses on one problem only by editing this post.
Closed 2 years ago.
Improve this question
I have a python script that writes to several file formats via Pandas. It can write to CSV/JSON/HTML/Excel.
However for some reason the script is writing blank files. When I open the file this is what I see:
Before printing the file I am printing the dataframe to the screen output so I can validate that the data is there. For example with CSV the output to the screen is this:
CSV data: ,AWS Account,Account Number,Name,Instance ID,AMI ID,Volumes,Private IP,Public IP,Private DNS,Availability Zone,VPC ID,Type,Key Pair Name,State,Launch Date
0,project-client-lab,123456789101,bastion001,i-xxxxxxxxxxxxxxxxxxxx,ami-xxxxxxxxxxxxxxxxxxx,vol-xxxxxxxxxxxxxxx,10.238.2.166,3.214.15.175,ip-10-238-2-166.ec2.internal,us-east-1a,vpc-xxxxxxxxxxxxxxxxx,t3.small,project-client-int01,running,March 10 2020
1,project-client-lab,123456789101,logicmonitor001,i-xxxxxxxxxxxxxxxxxxxx,ami-xxxxxxxxxxxxxxxxxxx,vol-0xxxxxxxxxxxxxx,10.238.2.52,,ip-10-238-2-52.ec2.internal,us-east-1a,vpc-xxxxxxxxxxxxxxxxx,m5.large,project-client-int01,running,September 02 2019
2,project-client-lab,123456789101,project-cassandra001,i-xxxxxxxxxxxxxxxxxxxx,ami-xxxxxxxxxxxxxxxxxxx,"vol-xxxxxxxxxxxxxxxxxx, vol-xxxxxxxxxxxxxxxxx",10.238.2.221,,ip-10-238-2-221.ec2.internal,us-east-1a,vpc-xxxxxxxxxxxxxxxxx,m5.large,project-client-int01,running,January 14 2020
3,project-client-lab,123456789101,project-cassandra003,i-xxxxxxxxxxxxxxxxxxxx,ami-xxxxxxxxxxxxxxxxxxx,"vol-xxxxxxxxxxxxxxxxxx, vol-xxxxxxxxxxxxxxxxx",10.238.2.207,,ip-10-238-2-207.ec2.internal,us-east-1a,vpc-xxxxxxxxxxxxxxxxx,m5.large,project-client-int01,running,January 14 2020
4,project-client-lab,123456789101,project-cassandra003,i-xxxxxxxxxxxxxxxxxxxx,ami-xxxxxxxxxxxxxxxxxxx,"vol-xxxxxxxxxxxxxxxxxx, vol-xxxxxxxxxxxxxxxxx",10.238.2.203,,ip-10-238-2-203.ec2.internal,us-east-1a,vpc-xxxxxxxxxxxxxxxxx,c5.xlarge,project-client-int01,running,January 22 2020
5,project-client-lab,123456789101,project-cassandra001,i-xxxxxxxxxxxxxxxxxxxx,ami-xxxxxxxxxxxxxxxxxxx,"vol-xxxxxxxxxxxxxxxxxx, vol-xxxxxxxxxxxxxxxxx",10.238.2.209,,ip-10-238-2-209.ec2.internal,us-east-1a,vpc-xxxxxxxxxxxxxxxxx,c5.xlarge,project-client-int01,running,January 22 2020
6,project-client-lab,123456789101,haproxy001,i-xxxxxxxxxxxxxxxxx,ami-xxxxxxxxxxxxxxxxxxx,vol-xxxxxxxxxxxxxxxxxx,10.238.2.169,54.242.118.165,ip-10-238-2-169.ec2.internal,us-east-1a,vpc-xxxxxxxxxxxxxxxxx,m5.large,project-client-int01,running,February 20 2020
7,project-client-lab,123456789101,logicmonitor002,i-xxxxxxxxxxxxxxx,ami-xxxxxxxxxxxxxxxxxxx,vol-0c48ff6ebb031008a,10.238.2.69,,ip-10-238-2-69.ec2.internal,us-east-1b,vpc-xxxxxxxxxxxxxxxxx,m5.large,project-client-int01,running,September 13 2019
These are the functions that write to file:
def mongo_export_to_file(interactive, aws_account, aws_account_number,instance_col=None,date=None):
    """Export EC2 inventory documents from MongoDB to CSV/JSON/HTML/Excel.

    interactive -- 0 exports all documents, non-zero filters by account number
                   (1 also switches output file naming to per-account).
    aws_account / aws_account_number -- used for filtering and file naming.
    instance_col -- MongoDB collection; looked up via set_db() when falsy.
    date -- report date as 'mmddyyyy'; defaults to today when None.

    NOTE(review): the `if __name__ == "__main__"` checks below test THIS
    module's name, so when the function is called from an importing module
    the prompts/previews are skipped and choice is forced to 1 (CSV) —
    confirm that is the intended behavior.
    """
    create_directories()
    # Normalize the report date to the mm-dd-YYYY display format.
    if date == None:
        format= "%m-%d-%Y"
        today = datetime.today()
        today = today.strftime(format)
        date = today
    else:
        format= "%m-%d-%Y"
        date = datetime.strptime(date,"%m%d%Y")
        date = date.strftime(format)
    # Fall back to the default collection when none was supplied.
    if not instance_col:
        _, _, instance_col = set_db()
    # make an API call to the MongoDB server
    if interactive == 0:
        mongo_docs = instance_col.find({})
    else:
        mongo_docs = instance_col.find({"Account Number": aws_account_number})
    # Convert the mongo docs to a DataFrame
    docs = pandas.DataFrame(mongo_docs)
    # Discard the Mongo ID for the documents
    docs.pop("_id")
    # Prompt for the output format only when run as a script; imported
    # callers always get CSV.
    if __name__ == "__main__":
        print("Choose a file format")
        print("1. CSV")
        print("2. JSON")
        print("3. HTML")
        print("4. Excel")
        choice = input("Enter a number 1-4: ")
        choice = int(choice)
    else:
        choice = 1
    if choice == 1:
        if __name__ == "__main__":
            # export MongoDB documents to CSV (preview on screen only)
            csv_export = docs.to_csv(sep=",") # CSV delimited by commas
            print ("\nCSV data:", csv_export)
        # Set the CSV output directory (relative to the current working
        # directory — NOTE(review): a different CWD silently changes where
        # files land; confirm callers run from the expected directory).
        output_dir = os.path.join("..", "..", "output_files", "aws_instance_list", "csv", "")
        if interactive == 1:
            output_file = os.path.join(output_dir, "aws-instance-list-" + aws_account + "-" + date +".csv")
        else:
            output_file = os.path.join(output_dir, "aws-instance-master-list-" + date +".csv")
        # export MongoDB documents to a CSV file, leaving out the row "labels" (row numbers)
        docs.to_csv(output_file, ",", index=False) # CSV delimited by commas
    elif choice == 2:
        if __name__ == "__main__":
            json_export = docs.to_json() # return JSON data
            print ("\nJSON data:", json_export)
        # Set the JSON output directory
        output_dir = os.path.join("..", "..", "output_files", "aws_instance_list", "json", "")
        if interactive == 1:
            output_file = os.path.join(output_dir, "aws-instance-list-" + aws_account + "-" + date +".json")
        else:
            output_file = os.path.join(output_dir, "aws-instance-master-list-" + date +".json")
        # export MongoDB documents to a JSON file
        docs.to_json(output_file)
    elif choice == 3:
        html_str = io.StringIO()
        # export as HTML into an in-memory buffer for the preview
        docs.to_html(
        buf=html_str,
        classes="table table-striped"
        )
        if __name__ == "__main__":
            # print out the HTML table
            print (html_str.getvalue())
        # Set the HTML output directory
        output_dir = os.path.join("..", "..", "output_files", "aws_instance_list", "html", "")
        if interactive == 1:
            output_file = os.path.join(output_dir, "aws-instance-list-" + aws_account + "-" + date +".html")
        else:
            output_file = os.path.join(output_dir, "aws-instance-master-list-" + date + ".html")
        # save the MongoDB documents as an HTML table
        docs.to_html(output_file)
    elif choice == 4:
        # Set the Excel output directory
        output_dir = os.path.join("..", "..", "output_files", "aws_instance_list", "excel", "")
        time.sleep(5)
        if interactive == 1:
            output_file = os.path.join(output_dir, "aws-instance-list-" + aws_account + "-" + date + ".xlsx")
        else:
            output_file = os.path.join(output_dir, "aws-instance-master-list-" + date + ".xlsx")
        # export MongoDB documents to a Excel file, leaving out the row "labels" (row numbers)
        # NOTE(review): ExcelWriter.save() is deprecated/removed in newer
        # pandas; close() alone persists the file — confirm pandas version.
        writer = ExcelWriter(output_file)
        docs.to_excel(writer,"EC2 List",index=False)
        writer.save()
        writer.close()
    # Interactive sessions loop back to the main menu unless the user exits.
    if __name__ == "__main__":
        exit = input("Exit program (y/n): ")
        if exit.lower() == "y" or exit.lower() == "yes":
            exit_program()
        else:
            main()
def print_reports(interactive,aws_account,aws_account_number):
    """Prompt for a report date and export that day's EC2 list.

    Validates the date typed by the user; on invalid input it re-prompts
    recursively and then RETURNS, so execution never continues with a bad
    date (the original fell through and queried Mongo anyway).
    """
    set_db(instance_col=None)
    inputDate = input("Enter the date in format 'dd/mm/yyyy': ")
    day,month,year = inputDate.split('/')
    isValidDate = True
    try:
        # Raises ValueError for impossible dates (e.g. 31/02/2022).
        datetime(int(year),int(month),int(day))
    except ValueError :
        isValidDate = False
    if(isValidDate) :
        print(f"Input date is valid: {inputDate}")
        format= "%m%d%Y"
        # Parse with %d/%m/%Y to match the dd/mm/yyyy prompt — the original
        # used %m/%d/%Y, which rejected any day greater than 12.
        inputDate = datetime.strptime(inputDate,"%d/%m/%Y")
        inputDate = inputDate.strftime(format)
    else:
        print(f"Input date is not valid: {inputDate}")
        # Retry, then stop this frame so the invalid date is never used.
        print_reports(interactive,aws_account,aws_account_number)
        return
    # Resolve the per-day collection name, e.g. "ec2_list_10242022".
    myclient = connect_db()
    mydb = myclient["aws_inventories"]
    instance_col = "ec2_list_" + inputDate
    instance_col = mydb[instance_col]
    mongo_export_to_file(interactive, aws_account, aws_account_number,instance_col,date=inputDate)
This is all my code in this script.
Why is this happening and how to I correct that?
You can view the file in Excel by:
Opening Excel
Going to the "Data" tab
In the "Get & Transform Data" section, click "From Text/CSV"
Does anyone know how to read an event log file in C:\Windows\System32\winevt\Logs with the .evtx extension?
I have already tried to open it using notepad and read using python but notepad says access is denied...
Does anyone know how to do it? Thanks in advance.
This is how you would read the file "Forwarded Events" from the event viewer. You need admin access, so I would run it as admin, but it will prompt you for a password if you don't.
import win32evtlog
import xml.etree.ElementTree as ET
import ctypes
import sys
def is_admin():
    """Return a truthy value if the current process has administrator rights.

    Falls back to False on any failure — e.g. on non-Windows platforms,
    where ctypes.windll does not exist.
    """
    try:
        return ctypes.windll.shell32.IsUserAnAdmin()
    except Exception:
        # A bare 'except:' would also swallow KeyboardInterrupt/SystemExit.
        return False
if is_admin():
    # Open the exported event log file for querying. A raw string keeps
    # the backslashes from being treated as escape sequences.
    query_handle = win32evtlog.EvtQuery(
        r'C:\Windows\System32\winevt\Logs\ForwardedEvents.evtx',
        win32evtlog.EvtQueryFilePath)
    read_count = 0
    # Keep reading until EvtNext returns an empty batch. (The original
    # 'while a == 1: a += 1' loop exited after a single iteration, so only
    # one record was ever processed.)
    while True:
        # read 1 record(s)
        events = win32evtlog.EvtNext(query_handle, 1)
        read_count += len(events)
        # if there is no record break the loop
        if len(events) == 0:
            break
        for event in events:
            xml_content = win32evtlog.EvtRender(event, win32evtlog.EvtRenderEventXml)
            # parse xml content
            xml = ET.fromstring(xml_content)
            # The root element carries an xmlns definition, so every find()
            # must use the namespace prefix.
            ns = '{http://schemas.microsoft.com/win/2004/08/events/event}'
            # NOTE(review): position-based lookup; assumes a fixed
            # EventData layout for this channel — confirm per event type.
            substatus = xml[1][9].text
            event_id = xml.find(f'.//{ns}EventID').text
            computer = xml.find(f'.//{ns}Computer').text
            channel = xml.find(f'.//{ns}Channel').text
            execution = xml.find(f'.//{ns}Execution')
            process_id = execution.get('ProcessID')
            thread_id = execution.get('ThreadID')
            time_created = xml.find(f'.//{ns}TimeCreated').get('SystemTime')
            #data_name = xml.findall('.//EventData')
            #substatus = data_name.get('Data')
            #print(substatus)
            event_data = f'Time: {time_created}, Computer: {computer}, Substatus: {substatus}, Event Id: {event_id}, Channel: {channel}, Process Id: {process_id}, Thread Id: {thread_id}'
            print(event_data)
            # user_data may contain arbitrary event-specific payload.
            user_data = xml.find(f'.//{ns}UserData')
else:
    # Not elevated: relaunch this script with admin rights (UAC prompt).
    ctypes.windll.shell32.ShellExecuteW(None, "runas", sys.executable, " ".join(sys.argv), None, 1)
# Keep the console window open until the user presses Enter.
input()
.evtx is the extension for Windows Eventlog files. It contains data in a special binary format designed by Microsoft so you cannot simply open it in a text editor.
There are open-source tools to read .evtx, and the NXLog EE can also read .evtx files. (Disclaimer: I'm affiliated with the latter.)
I modified the accepted answer a bit as following, so it becomes reusable:
import xml.etree.ElementTree as Et
import win32evtlog
from collections import namedtuple
# Hoisted to module level: the original re-created this namedtuple class on
# every parse_raw_event() call.
WindowsEvent = namedtuple('WindowsEvent',
                          'event_id, level, time_created, computer')


class EventLogParser:
    """Parse an exported Windows event log (.evtx) file via win32evtlog."""

    def __init__(self, exported_log_file):
        # Path to the .evtx file; str() is applied at query time so any
        # path-like object is accepted.
        self.exported_log_file = exported_log_file

    def get_all_events(self):
        """Return every event in the file (newest first) as WindowsEvent tuples."""
        windows_events = []
        query_handle = win32evtlog.EvtQuery(str(self.exported_log_file),
                                            win32evtlog.EvtQueryFilePath | win32evtlog.EvtQueryReverseDirection)
        while True:
            # Fetch one record at a time; an empty batch means end of file.
            raw_event_collection = win32evtlog.EvtNext(query_handle, 1)
            if len(raw_event_collection) == 0:
                break
            for raw_event in raw_event_collection:
                windows_events.append(self.parse_raw_event(raw_event))
        return windows_events

    def parse_raw_event(self, raw_event):
        """Render one raw event to XML and extract the common System fields."""
        xml_content = win32evtlog.EvtRender(raw_event, win32evtlog.EvtRenderEventXml)
        root = Et.fromstring(xml_content)
        # The root tag is namespaced; recover the namespace from the tag itself.
        ns = "{" + root.tag.split('}')[0].strip('{') + "}"
        system = root.find(f'{ns}System')
        event_id = system.find(f'{ns}EventID').text
        level = system.find(f'{ns}Level').text
        time_created = system.find(f'{ns}TimeCreated').get('SystemTime')
        computer = system.find(f'{ns}Computer').text
        return WindowsEvent(event_id, level, time_created, computer)
I use the "python-evtx" library, you can install it using this command:
pip install python-evtx
In my case, I'm not interested in reading records with the "Information" level.
import os
import codecs
from lxml import etree
import Evtx.Evtx as evtx
def evtxFile(absolutePath, filenameWithExt, ext, _fromDate, _toDate):
    """Extract non-informational records from a .evtx file into a filtered text file.

    Reads the event log at *absolutePath*, keeps records whose Level is not
    "0" or "4" (i.e. skips "Information") and whose date lies in
    [_fromDate, _toDate], then writes the sorted, de-duplicated lines to
    filtered_data_files/windows_<channel>_event_viewer_logs<ext>.

    NOTE(review): relies on module-level helpers defined elsewhere —
    tempFilePath, parseXMLtoString(), createFolder() — confirm they exist
    in the importing module.
    """
    print("Reading: " + filenameWithExt)
    outText = ""
    channel = ""
    #read the windows event viewer log and convert its contents to XML
    with codecs.open(tempFilePath, "a+", "utf-8", "ignore") as tempFile:
        with evtx.Evtx(absolutePath) as log:
            for record in log.records():
                xmlLine = record.xml()
                # Strip the default namespace so plain XPath expressions work.
                xmlLine = xmlLine.replace(" xmlns=\"http://schemas.microsoft.com/win/2004/08/events/event\"", "")
                xmlParse = etree.XML(xmlLine)
                level = parseXMLtoString(xmlParse, ".//Level/text()")
                # Levels "0" and "4" are "Information" records; skip them.
                if not level == "0" and not level == "4":
                    # NOTE(review): the '#Name'/'#Qualifiers'/'#SystemTime'
                    # suffixes look like attribute selectors interpreted by
                    # parseXMLtoString (standard XPath would use '@') —
                    # confirm against that helper's implementation.
                    providerName = parseXMLtoString(xmlParse, ".//Provider/#Name")
                    qualifiers = parseXMLtoString(xmlParse, ".//EventID/#Qualifiers")
                    timestamp = parseXMLtoString(xmlParse, ".//TimeCreated/#SystemTime")
                    eventID = parseXMLtoString(xmlParse, ".//EventID/text()")
                    task = parseXMLtoString(xmlParse, ".//Task/text()")
                    keywords = parseXMLtoString(xmlParse, ".//Keywords/text()")
                    eventRecordID = parseXMLtoString(xmlParse, ".//EventRecordID/text()")
                    channel = parseXMLtoString(xmlParse, ".//Channel/text()")
                    computer = parseXMLtoString(xmlParse, ".//Computer/text()")
                    message = parseXMLtoString(xmlParse, ".//Data/text()")
                    # Map the numeric level to its display name.
                    if level == "1":
                        level = "Critical"
                    elif level == "2":
                        level = "Error"
                    elif level == "3":
                        level = "Warning"
                    # timestamp is ISO-like: chars 0-10 are the date,
                    # 11-19 the time (dots in the time are dropped).
                    date = timestamp[0:10]
                    time = timestamp[11:19]
                    time = time.replace(".", "")
                    _date = datetime.strptime(date, "%Y-%m-%d").date()
                    # Keep only records inside the requested date range.
                    if _fromDate <= _date <= _toDate:
                        # Flatten the message to a single line.
                        message = message.replace("<string>", "")
                        message = message.replace("</string>", "")
                        message = message.replace("\r\n", " ")
                        message = message.replace("\n\r", " ")
                        message = message.replace("\n", " ")
                        message = message.replace("\r", " ")
                        # Pipe-delimited output record.
                        outText = date + " " + time + "|" + level + "|" + message.strip() + "|" + task + "|" + computer + "|" + providerName + "|" + qualifiers + "|" + eventID + "|" + eventRecordID + "|" + keywords + "\n"
                        tempFile.writelines(outText)
    with codecs.open(tempFilePath, "r", "utf-8", "ignore") as tempFile2:
        myLinesFromDateRange = tempFile2.readlines()
    #delete the temporary file that was created
    os.remove(tempFilePath)
    if len(myLinesFromDateRange) > 0:
        createFolder("\\filtered_data_files\\")
        outFilename = "windows_" + channel.lower() + "_event_viewer_logs" + ext
        myLinesFromDateRange.sort()
        #remove duplicate records from the list
        myFinalLinesFromDateRange = list(set(myLinesFromDateRange))
        myFinalLinesFromDateRange.sort()
        with codecs.open(os.getcwd() + "\\filtered_data_files\\" + outFilename, "a+", "utf-8", "ignore") as linesFromDateRange:
            linesFromDateRange.seek(0)
            # If the output file already has content, separate runs with a newline.
            if len(linesFromDateRange.read(100)) > 0:
                linesFromDateRange.writelines("\n")
            linesFromDateRange.writelines(myFinalLinesFromDateRange)
        del myLinesFromDateRange[:]
        del myFinalLinesFromDateRange[:]
    else:
        print("No data was found within the specified date range.")
    print("Closing: " + filenameWithExt)
I hope it helps you or someone else in the future.
EDIT:
The "tempFilePath" can be anything you want, for example:
tempFilePath = os.getcwd() + "\\tempFile.txt"
I collected some information first before calling the "evtxFile" function:
The "From" and the "To" dates are in the following format: YYYY-MM-DD
Converted the dates to "date" data type:
# Parse the user-supplied "From"/"To" strings (format YYYY-MM-DD) into
# datetime.date objects for the range comparison inside evtxFile().
_fromDate = datetime.strptime(fromDate, "%Y-%m-%d").date()
_toDate = datetime.strptime(toDate, "%Y-%m-%d").date()
Divided the directory where the .evtx files are located into different parts:
def splitDirectory(root, file):
    """Split one directory entry into its path components.

    Returns a 5-tuple: (absolute path, containing directory, file name
    with extension, file name without extension, extension).
    """
    absolutePathOfFile = os.path.join(root, file)
    # os.path.split / splitext each return a pair — unpack both directly.
    filePathWithoutFilename, filenameWithExt = os.path.split(absolutePathOfFile)
    filenameWithoutExt, extension = os.path.splitext(filenameWithExt)
    return absolutePathOfFile, filePathWithoutFilename, filenameWithExt, filenameWithoutExt, extension
# Walk the directory tree and process every .evtx file found.
for root, subFolders, files in os.walk(directoryPath):
    for f in files:
        # The unpacking must be one logical line — the original paste split
        # it across two lines without a continuation, a syntax error.
        absolutePathOfFile, filePathWithoutFilename, filenameWithExt, \
            filenameWithoutExt, extension = splitDirectory(root, f)
        if extension == ".evtx":
            evtxFile(absolutePathOfFile, filenameWithExt, ".txt", _fromDate, _toDate)
My goal is to download full metazoan genome sequences from NCBI. I have a list of unique ID numbers for the genome sequences I need. I planned to use the Bio.Entrez module EFetch to download the data but learned today via the Nov 2, 2011 release notes (http://1.usa.gov/1TA5osg) that EFetch does not support the 'Genome' database. Can anyone suggest an alternative package/module or some other way around this? Thank you in advance!
Here is a script for you -- though you may need to tinker with it to make it work. Name the script whatever you prefer, but when you call the script do so as follows:
python name_of_script[with .py extension] your_email_address.
You need to add your email to the end of the call else it will not work. If you have a text file of accession numbers (1/line), then choose option 2. If you choose option 1, it will ask you for items like the name of the organism, strain name, and keywords. Use as many keywords as you would like -- just be certain to separate them by commas. If you go with the first option, NCBI will be searched and will return GI numbers [NOTE: NCBI is phasing out the GI numbers in 9.2016 so this script may not work after this point] which will then be used to snag the accession numbers. Once all the accession numbers are present, a folder is created, and a subfolder is created for each accession number (named as the accession number). In each subfolder, the corresponding fasta AND genbank file will be downloaded. These files will carry the accession number as the file name (e.g. accession_number.fa, accession_number.gb). Edit script to your purposes.
ALSO...Please note the warning (ACHTUNG) portion of the script. Sometimes the rules can be bent...but if you are egregious enough, your IP may be blocked from NCBI. You have been warned.
import os
import os.path
import sys
import re #regular expressions
from Bio import Entrez
import datetime
import time
import glob
# The caller must pass their e-mail address as the first CLI argument;
# NCBI requires it for E-utilities requests.
arguments = sys.argv
Entrez.email = arguments[1] #email
accession_ids = []
print('Select method for obtaining the accession numbers?\n')
action = input('1 -- Input Search Terms\n2 -- Use text file\n')
# Option 1: build an NCBI search phrase interactively.
if action == '1':
    print('\nYou will be asked to enter an organism name, a strain name, and keywords.')
    print('It is not necessary to provide a value to each item (you may just hit [ENTER]), but you must provide at least one item.\n')
    organism = input('Enter the organism you wish to search for (e.g. Escherichia coli [ENTER])\n')
    strain = input('Enter the strain you wish to search for. (e.g., HUSEC2011 [ENTER])\n')
    keywords = input('Enter the keywords separated by a comma (e.g., complete genome, contigs, partial [ENTER])\n')
    search_phrase = ''
    # Multiple keywords become a list; each gets its own AND clause below.
    if ',' in keywords:
        keywords = keywords.split(',')
    ncbi_terms = ['organism', 'strain', 'keyword']
    ncbi_values = [organism, strain, keywords]
    # Assemble "(value[term]) AND (value[term]) ..." from the non-empty items.
    for index, n in enumerate(ncbi_values):
        if index == 0 and n != '':
            search_phrase = '(' + n + '[' + ncbi_terms[index] + '])'
        else:
            if n != '' and index != len(ncbi_values)-1:
                search_phrase = search_phrase + ' AND (' + n + '[' + ncbi_terms[index] + '])'
            # Last item and a single keyword (still a plain string).
            if index == len(ncbi_values)-1 and n != '' and type(n) is not list:
                search_phrase = search_phrase + ' AND (' + n + '[' + ncbi_terms[index] + '])'
            # Last item and several keywords (split into a list above).
            if index == len(ncbi_values)-1 and n != '' and type(n) is list:
                for name in n:
                    name = name.lstrip()
                    search_phrase = search_phrase + ' AND (' + name + '[' + ncbi_terms[index] + '])'
    print('Here is the complete search line that will be used: \n\n', search_phrase)
    # Search the nucleotide database, then fetch accession numbers for the hits.
    # NOTE(review): esearch returns GI numbers, which NCBI began phasing
    # out in September 2016 — this lookup may stop working.
    handle = Entrez.esearch(db='nuccore', term=search_phrase, retmax=1000, rettype='acc', retmode='text')
    result = Entrez.read(handle)
    handle.close()
    #print(result['Count'])
    gi_numbers = result['IdList']
    fetch_handle = Entrez.efetch(db='nucleotide', id=result['IdList'], rettype='acc', retmode='text')
    # Each line of the fetch response is one accession number.
    accession_ids = [id.strip() for id in fetch_handle]
    fetch_handle.close()
# Option 2: read accession numbers from a text file, one per line.
if action == '2': #use this option if you have a file of accession #s
    file_name = input('Enter the name of the file\n')
    with open(file_name, 'r') as input_file:
        lines = input_file.readlines()
        for line in lines:
            line = line.replace('\n', '')
            accession_ids.append(line)
#--------------------------------------------------------------------------------------------------------------
#----------------------------------- Make directory to store files --------------------------------------------
new_path = 'Genbank_Files/'
if not os.path.exists(new_path):
    os.makedirs(new_path)
print('You have ' + str(len(accession_ids)) + ' file(s) to download.') #print(accession_ids)
ending='.gb'
files = []
##CHECK IF FILE HAS BEEN DOWNLOADED
# Collect every .gb file already present under Genbank_Files/ ...
for dirpath, dirnames, filenames in os.walk(new_path):
    for filename in [f for f in filenames if f.endswith(ending)]: #for zipped files
        files.append(os.path.join(dirpath,filename))
# ... and drop accessions that are already downloaded so they are skipped.
for f in files:
    f = f.rsplit('/')[-1]
    f = f.replace('.gb', '')
    if f in accession_ids:
        ind = accession_ids.index(f)
        accession_ids.pop(ind)
print('')
print('You have ' + str(len(accession_ids)) + ' file(s) to download.')
#--------------------------------------------------------------------------
###############################################################################
#---ACHTUNG--ACHTUNG--ACHTUNG--ACHTUNG--ACHTUNG--ACHTUNG--ACHTUNG--ACHTUNG----#
###############################################################################
# Call Entrez to download files
# If downloading more than 100 files...
# Run this script only between 9pm-5am Monday - Friday EST
# Send E-utilities requests to http://eutils.ncbi.nlm.nih.gov
# Make no more than 3 requests every 1 second (Biopython takes care of this).
# Use URL parameter email & tool for distributed software
# NCBI's Disclaimer and Copyright notice must be evident to users of your service.
#
# Use this script at your own risk.
# Neither the script author nor author's employers are responsible for consequences arising from improper usage
###############################################################################
# CALL ENTREZ: Call Entrez to download genbank AND fasta (nucleotide) files using accession numbers.
###############################################################################
start_day = datetime.date.today().weekday()  # 0 is Monday, 6 is Sunday
start_time = datetime.datetime.now().time()
print(str(start_day), str(start_time))
print('')
# Allowed windows: weeknights 9pm-5am, any time on the weekend, or small
# batches (<= 100 files). NOTE: weekday() is 5 for Saturday and 6 for
# Sunday, so the weekend test must be >= 5 — the original '> 5' wrongly
# excluded Saturdays.
if ((start_day < 5 and start_time > datetime.time(hour=21)) or
        (start_day < 5 and start_time < datetime.time(hour=5)) or
        start_day >= 5 or len(accession_ids) <= 100):
    print('Calling Entrez...')
    for a in accession_ids:
        # Re-check the time window before each download so a long run
        # stops once the allowed window closes.
        if ((datetime.date.today().weekday() < 5 and datetime.datetime.now().time() > datetime.time(hour=21)) or
                (datetime.date.today().weekday() < 5 and datetime.datetime.now().time() < datetime.time(hour=5)) or
                (datetime.date.today().weekday() == start_day + 1 and datetime.datetime.now().time() < datetime.time(hour=5)) or
                (datetime.date.today().weekday() >= 5) or len(accession_ids) <= 100):
            print('Downloading ' + a)
            # One sub-folder per accession number.
            new_path = 'Genbank_Files/' + a + '/'
            if not os.path.exists(new_path):
                os.makedirs(new_path)
            # GenBank record. 'with' guarantees the file is closed even
            # when Entrez raises mid-download (the original left handles
            # open on error).
            handle = Entrez.efetch(db='nucleotide', id=a, rettype='gb', retmode='text', seq_start=0)
            with open(new_path + a + '.gb', 'w') as local_file:
                local_file.write(handle.read())
            handle.close()
            # FASTA (nucleotide) record.
            handle = Entrez.efetch(db='nucleotide', id=a, rettype='fasta', retmode='text')
            with open(new_path + a + '.fna', 'w') as local_file:
                local_file.write(handle.read())
            handle.close()
else:
    print('You have too many files to download at the time. Try again later.')
#-------
My code works and increments the filename, but only for the first two files; after that it appends new strings to the existing second file. Please help me fix the code so the increment continues past the second file.
text = 'some text'
file_path = '/path/to/file'
filename = 'textfile'
# Find the first index whose file does not exist yet, then write there.
# The original only handled i = 1 and i = 2, so every later run appended
# to the second file instead of creating a third; a loop generalizes the
# increment to any count. (It also mixed bare names with undefined
# 'self.' attributes — the local variables above are used throughout.)
i = 1
while os.path.exists(file_path + filename + str(i) + '.txt'):
    i += 1
txtfile = file_path + filename + str(i) + '.txt'
# 'with' closes the file even if the write fails.
with open(txtfile, "a") as text_file:
    text_file.write(text)
If your example is part of a loop, you're resetting i to 1 in every iteration. Put the i = 1 outside of this part.
And it will also start at 1 when you restart your program - sometimes not what you want.