XLRDError: Expected BOF record '\x03OPC ' - python

Is there an easy way to find out what the "Expected BOF record" error message means?
Is there a list or similar reference where such messages can be looked up?
I have just installed xlrd 0.9.2, in addition to xlutils 1.6.0. (I know that's overkill; could that be the problem?) I am reading through a huge batch of Excel files, all of which I know are Excel files, at least according to their filenames.
The error message appears even though I've added a try/except test. Here's the code where the error shows up:
# Build and return a dict mapping each filename in file_list to
# [lenghtvalue, dispvalue, Froudemax, Froudemin]. The first two values are read
# from the workbook's sheet at index 5, the last two from Fdict.
# file_list, start_dir and Fdict are defined outside this listing.
# (Python 2 code; the indentation was lost in this listing.)
def locate_vals():
val_dict = {}
Fcount = 0
for filename in file_list:
try:
wb = xlrd.open_workbook(os.path.join(start_dir, filename))
sheet = wb.sheet_by_index(5) # could also select with sheet_by_name('name')
# model = sheet.cell_value(2, 3)
lenghtvalue = sheet.cell_value(9, 7) # (y,x)
dispvalue = sheet.cell_value(15, 7)
try:
Froudemax = max(Fdict.get(filename)[Fcount - 1], key=str)
Froudemin = min(Fdict.get(filename)[Fcount - 1])
Fcount += 1
# Any failure in the Fdict lookup falls back to the sentinel values below.
except:
Froudemax = 5555555555555555555555555555
Froudemin = 6666666666666666666666666666
print 'Froudemax(5) eller Froudemin(6) har problem'
val_dict[filename] = [lenghtvalue, dispvalue, Froudemax, Froudemin]
# NOTE(review): `XLRDError and IndexError` evaluates to IndexError alone, so an
# XLRDError raised by open_workbook is not caught by this clause;
# `except (XLRDError, IndexError):` would catch both.
except XLRDError and IndexError:
print 'Problem in locate_vals with:', filename
return val_dict
val_dict = locate_vals()
My errormessage says:
Traceback (most recent call last):
File "C:\Documents and Settings\OPC\My Documents\Haavard_Refvik_Workspace\STT_ComparisonTool\Run_Comparison_Tool.py", line 129, in <module>
val_dict = locate_vals()
File "C:\Documents and Settings\OPC\My Documents\Haavard_Refvik_Workspace\STT_ComparisonTool\Run_Comparison_Tool.py", line 111, in locate_vals
wb = xlrd.open_workbook(os.path.join(start_dir, filename))
File "C:\Python27\lib\site-packages\xlrd-0.9.2-py2.7.egg\xlrd\__init__.py", line 435, in open_workbook
ragged_rows=ragged_rows,
File "C:\Python27\lib\site-packages\xlrd-0.9.2-py2.7.egg\xlrd\book.py", line 91, in open_workbook_xls
biff_version = bk.getbof(XL_WORKBOOK_GLOBALS)
File "C:\Python27\lib\site-packages\xlrd-0.9.2-py2.7.egg\xlrd\book.py", line 1258, in getbof
bof_error('Expected BOF record; found %r' % self.mem[savpos:savpos+8])
File "C:\Python27\lib\site-packages\xlrd-0.9.2-py2.7.egg\xlrd\book.py", line 1252, in bof_error
raise XLRDError('Unsupported format, or corrupt file: ' + msg)
xlrd.biffh.XLRDError: Unsupported format, or corrupt file: Expected BOF record; found '\x03OPC '
EDIT: OPC is the username on the computer.

Added a try:, except: test, and found the files that caused the problem. It seems they were old "automatic security save" files that had been saved in the same folder as the rest of my files, but were hidden. The names of these files looked like this:
~$test spreadsheet modelxxx.xlsx
I should of course have run the test earlier, but I had put the try statement one indent level too far to the left, so I never got to see which file caused the error. Thanks for the response.
I would still appreciate it if anyone could point me in the direction of a table of error messages, though, if such a table exists.

Related

Skip line in .txt import to postgresql

I'm trying to import 5,000 .txt files into a PostgreSQL database. My script runs fine until it reaches a line that doesn't fit the format. For example, every file has an empty line at the end, which also causes the script to crash.
I've tried to handle exceptions, but without success...
My script:
import csv
import os
import sys
import psycopg2
# Connection settings for the local "demo" PostgreSQL database.
_db_settings = {
    "host": "localhost",
    "database": "demo",
    "user": "demo",
    "password": "123",
    "port": "5432",
}
conn = psycopg2.connect(**_db_settings)
# Single module-level cursor shared by the rest of the script.
cur = conn.cursor()
# Raise the csv module's per-field size limit as high as this platform allows.
# csv.field_size_limit() raises OverflowError when the value is too large for
# it, so start at sys.maxsize and shrink tenfold until a value is accepted.
maxInt = sys.maxsize
while True:
    try:
        csv.field_size_limit(maxInt)
        break
    except OverflowError:
        # Floor division keeps the value an exact int (no float round-trip).
        maxInt = maxInt // 10
def searchFiles(directory='', extension=''):
    """Recursively collect files under *directory* whose name ends with *extension*.

    The comparison is case-insensitive. When *extension* is empty, no file is
    collected; every file found is reported with a 'FAILED TO READ' line
    instead.

    Returns a list of paths, each built by joining the walked directory path
    with the file name.
    """
    print('SEARCHING IN: ', directory)
    filelist = []
    extension = extension.lower()
    for dirpath, dirnames, files in os.walk(directory):
        for name in files:
            if extension and name.lower().endswith(extension):
                filelist.append(os.path.join(dirpath, name))
            elif not extension:
                print('FAILED TO READ: ', (os.path.join(dirpath, name)))
    print('FINISHED FILE SEARCH AND FOUND ', str(len(filelist)), ' FILES')
    return filelist
def importData(fileToImport):
    """Insert the ':'-delimited rows of *fileToImport* into the ``demo`` table.

    Uses the module-level ``conn`` and ``cur`` objects. The first two fields
    of each row are inserted and committed one row at a time; a row that
    cannot be inserted is reported with 'FAILED AT LINE' and skipped.

    Errors raised by the csv reader itself while iterating (for example
    ``_csv.Error``) are not handled here and propagate to the caller.
    """
    with open(fileToImport, 'r') as f:
        reader = csv.reader(f, delimiter=':')
        for line in reader:
            try:
                cur.execute("""INSERT INTO demo VALUES (%s, %s)""", (line[0], line[1]))
                conn.commit()
            except Exception:
                # A failed statement leaves the transaction aborted; roll back
                # so the following rows can still be inserted.
                conn.rollback()
                print('FAILED AT LINE: ', line)
# Show the connection parameters and server version before importing.
print(conn.get_dsn_parameters())
cur.execute("SELECT version();")
record = cur.fetchone()
print("You are connected to - ", record)

# Import every .txt file found under ./output, reporting progress per file.
fileList = searchFiles('output', '.txt')
counter = 0
length = len(fileList)
for file in fileList:
    # if counter % 10 == 0:
    print('Processing File: ', str(file), ', COMPLETED: ', str(counter), '/', str(length))
    importData(str(file))
    counter += 1
print('FINISHED IMPORT OF ', str(length), ' FILES')
A few lines of the data I'm trying to import:
example1#example.com:123456
example2#example.com:password!1
The error I'm getting:
File "import.py", line 66, in <module>
importData(str(file))
File "import.py", line 45, in importData
for line in reader:
_csv.Error: line contains NULL byte
How should I handle lines which can not get imported?
Thanks for any help
Your traceback shows the source of the exception in for line in reader:
File "import.py", line 45, in importData
for line in reader:
_csv.Error: line contains NULL byte
and you do not handle exceptions at that point. As the exception suggests, it is raised by your csv reader instance. While you certainly could wrap your for loop in a try-except block, your loop will still end as soon as the exception is raised.
This exception may be caused by the file having a different encoding than your locale's, which is assumed by open() if no encoding is explicitly provided:
In text mode, if encoding is not specified the encoding used is
platform dependent: locale.getpreferredencoding(False) is called to
get the current locale encoding.
The accepted answer in this Q&A outlines a solution to deal with that, provided that you can identify the correct encoding to open the file with. The Q&A also shows some approaches on how to get rid of NULL bytes in the file, prior to handing it over to a reader.
You might also want to simply skip empty lines instead of firing them to your DB and handle the exception, e.g.
for line in reader:
if not line:
continue
try:
[...]

python memoryerror - large loop xml to mongodb

I downloaded a zip file from https://clinicaltrials.gov/AllPublicXML.zip, which contains over 200k xml files (most are < 10 kb in size), to a directory (see 'dirpath_zip' in the CODE) I created in ubuntu 16.04 (using DigitalOcean). What I'm trying to accomplish is loading all of these into MongoDB (also installed in the same location as the zip file).
I ran the CODE below twice and consistently failed when processing the 15988th file.
I've googled around and tried reading other posts regarding this particular error, but couldn't find a way to solve this particular issue. Actually, I'm not really sure what the problem really is... Any help is much appreciated!!
CODE:
import re
import json
import zipfile
import pymongo
import datetime
import xmltodict
from bs4 import BeautifulSoup
from pprint import pprint as ppt
def timestamper(stamp_type="regular"):
    """Return the current local time as a string.

    stamp_type="regular"  -> ``str(datetime.datetime.now())``
    stamp_type="filename" -> compact ``YYYYMMDD_HHMMSS`` form, safe to embed
                             in a collection or file name

    Any other value terminates the program through ``sys.exit`` with an
    error message.
    """
    # The script does not import sys at module level; without this import the
    # error branch below would raise NameError instead of exiting.
    import sys

    now = datetime.datetime.now()
    if stamp_type == "regular":
        return str(now)
    if stamp_type == "filename":
        return now.strftime("%Y%m%d_%H%M%S")
    sys.exit("ERROR [timestamper()]: unexpected 'stamp_type' (parameter) encountered")
# Target collection: ctgov.ts_<YYYYMMDD_HHMMSS>, named after the start time.
client = pymongo.MongoClient()
db = client['ctgov']
coll_name = "ts_"+timestamper(stamp_type="filename")
coll = db[coll_name]

dirpath_zip = '/glbdat/ctgov/all/alltrials_20180402.zip'
z = zipfile.ZipFile(dirpath_zip, 'r')

# i counts the XML members inserted so far (Contents.txt is skipped).
i = 0
for xmlfile in z.namelist():
    print(i, 'parsing:', xmlfile)
    if xmlfile == 'Contents.txt':
        print(xmlfile, '==> entering "continue"')
        continue
    # Parse the member, convert its <clinical_study> element to a dict via
    # xmltodict, and normalise whitespace in the JSON text before inserting.
    soup = BeautifulSoup(z.read(xmlfile), 'lxml')
    json_study = json.loads(re.sub(r'\s', ' ', json.dumps(xmltodict.parse(str(soup.find('clinical_study'))))).strip())
    coll.insert_one(json_study)
    i+=1
ERROR MESSAGE:
Traceback (most recent call last):
File "zip_to_mongo_alltrials.py", line 38, in <module>
soup = BeautifulSoup(z.read(xmlfile), 'lxml')
File "/usr/local/lib/python3.5/dist-packages/bs4/__init__.py", line 225, in __init__
markup, from_encoding, exclude_encodings=exclude_encodings)):
File "/usr/local/lib/python3.5/dist-packages/bs4/builder/_lxml.py", line 118, in prepare_markup
for encoding in detector.encodings:
File "/usr/local/lib/python3.5/dist-packages/bs4/dammit.py", line 264, in encodings
self.chardet_encoding = chardet_dammit(self.markup)
File "/usr/local/lib/python3.5/dist-packages/bs4/dammit.py", line 34, in chardet_dammit
return chardet.detect(s)['encoding']
File "/usr/lib/python3/dist-packages/chardet/__init__.py", line 30, in detect
u.feed(aBuf)
File "/usr/lib/python3/dist-packages/chardet/universaldetector.py", line 128, in feed
if prober.feed(aBuf) == constants.eFoundIt:
File "/usr/lib/python3/dist-packages/chardet/charsetgroupprober.py", line 64, in feed
st = prober.feed(aBuf)
File "/usr/lib/python3/dist-packages/chardet/hebrewprober.py", line 224, in feed
aBuf = self.filter_high_bit_only(aBuf)
File "/usr/lib/python3/dist-packages/chardet/charsetprober.py", line 53, in filter_high_bit_only
aBuf = re.sub(b'([\x00-\x7F])+', b' ', aBuf)
File "/usr/lib/python3.5/re.py", line 182, in sub
return _compile(pattern, flags).sub(repl, string, count)
MemoryError
Try moving the file reading and database insertion into a separate function.
Also add gc.collect() to trigger garbage collection.
import gc


def read_xml_insert(xmlfile):
    """Parse one XML member of the open archive ``z`` and insert it into ``coll``.

    Keeping the parsing inside a function means the parsed document and the
    intermediate JSON string are local names that are released on return.
    """
    soup = BeautifulSoup(z.read(xmlfile), 'lxml')
    json_study = json.loads(re.sub(r'\s', ' ', json.dumps(xmltodict.parse(str(soup.find('clinical_study'))))).strip())
    coll.insert_one(json_study)


for xmlfile in z.namelist():
    print(i, 'parsing:', xmlfile)
    if xmlfile == 'Contents.txt':
        print(xmlfile, '==> entering "continue"')
        continue
    read_xml_insert(xmlfile)
    i+=1
    # Run a garbage collection after every file that was processed.
    gc.collect()
Please see whether this helps.

Issue with moving a file using shutil.move()

I am trying to move a file using the shutil module in Python. I keep getting this error:
Traceback (most recent call last):
File "<input>", line 31, in <module>
File "C:\Python27\lib\shutil.py", line 302, in move
copy2(src, real_dst)
File "C:\Python27\lib\shutil.py", line 130, in copy2
copyfile(src, dst)
File "C:\Python27\lib\shutil.py", line 82, in copyfile
with open(src, 'rb') as fsrc:
IOError: [Errno 2] No such file or directory: 'MLR_Resi_Customer.txt'
I do not understand why I am getting "No such file or directory". If I print the file name instead of calling shutil.move(filename, new_dest), it prints the file name I am looking for.
import shutil
import os
import datetime
# Script (Python 2): moves processed marketing files from `source` into
# per-type storage folders, then renames the stored files with a timestamp.
# (The indentation was lost in this listing.)
# set location of folders and files needed
source = '//nspinfwcipp01/BSA/marketing'
dest_cust = '//nspinfwcipp01/BSA/marketing/res_cust'
dest_pros = '//nspinfwcipp01/BSA/marketing/res_pros'
dest_win = '//nspinfwcipp01/BSA/marketing/res_win'
# create date time stamp of now
# NOTE(review): str(datetime.now()) contains spaces and ':' characters, which
# then end up in the new file names built below.
dt = str(datetime.datetime.now())
# os.listdir() returns bare file names, not paths that include `source`.
files = os.listdir(source)
print files
# create new path to storage files for old data
cust_files = os.listdir(dest_cust)
pros_files = os.listdir(dest_pros)
win_files = os.listdir(dest_win)
# new file names
new_cust = 'MLR_Resi_Customers'+dt+'.txt'
new_pros = 'MLR_Resi_Prospects'+dt+'.txt'
new_win = 'MLR_Resi_Winbacks'+dt+'.txt'
#move files from marketing folder into their own folder when after done processing
# NOTE(review): `f` is a bare name, so shutil.move(f, ...) resolves it against
# the current working directory rather than `source`;
# os.path.join(source, f) would give the full path.
for f in files:
if (f.endswith("Customer.txt")):
print f
shutil.move(f,dest_cust)
# NOTE(review): the two suffixes below have no ".txt", unlike the first test --
# presumably they never match names such as 'MLR_Resi_Prospects.txt'; verify.
elif (f.endswith("Prospects")):
#print f
shutil.move(f,dest_pros)
elif (f.endswith("Winbacks")):
#print f
shutil.move(f,dest_win)
##rename files in storage with data for Kalyan and Jake's Project
## rename customer file for storage
# NOTE(review): the os.rename() calls below use bare file names as well, so
# they also act on the current working directory.
for x in cust_files:
if (x.endswith("Customers")):
#print x
new_cust = 'MLR_Resi_Customer'+dt+'.txt'
os.rename('MLR_Resi_Customers.txt', new_cust)
else:
print "no_customer_file"
## rename prospect file for storage
# NOTE(review): iterates cust_files, not pros_files -- presumably unintended.
for x in cust_files:
if (x.endswith("Prospects")):
#print x
os.rename('MLR_Resi_Prospects.txt', new_pros)
else:
print "no_prospect_file"
## rename winback file for storage
# NOTE(review): iterates cust_files, not win_files -- presumably unintended.
for x in cust_files:
if (x.endswith("Winbacks")):
#print x
os.rename('MLR_Resi_Winbacks.txt', new_win)
else:
print "no_winback_file"
So I am not sure what I am doing wrong. The path to the files is correct, and it seems to be able to print the file name just fine. Any help with the issues above is greatly appreciated.
Use shutil.move(glob.glob(f)[0], dest_whatever); that should solve your problem by giving it an actual path to the file. Note, however, that if the file doesn't exist, glob.glob will return an empty list, so indexing it with [0] will raise an IndexError.

Invalid mode or filename when using openpyxl in python 2.7

I am trying to write to an existing workbook with the openpyxl tools.
But I am getting Errno 22 and don't know why.
My Script looks like this :
#Reading & writing to a workbook
from openpyxl import Workbook
from openpyxl.compat import range
from openpyxl.cell import get_column_letter
# Create a new workbook, fill the active sheet with each cell's own coordinate
# as its value, add a second sheet named 'Pi', and save to dest_filename.
# (The indentation was lost in this listing.)
wb = Workbook()
# NOTE(review): in a normal (non-raw) string literal '\b' is the backspace
# escape (\x08), so this path does not contain the text "\book2.xls";
# '\P' is not an escape sequence and is kept as written.
dest_filename = 'J:\Python_Script\book2.xls'
ws = wb.active
ws.title = "Tabelle 1"
# Columns 1..39 and rows 1..599; every cell gets its coordinate, e.g. 'A1'.
for col_idx in range(1, 40):
col = get_column_letter(col_idx)
for row in range(1, 600):
ws.cell('%s%s'%(col, row)).value = '%s%s' % (col, row)
ws = wb.create_sheet()
ws.title = 'Pi'
ws['F5'] = 3.14
wb.save(filename = dest_filename)
and this is the Console output with the error message i got :
//------------------
Traceback (most recent call last):
File "J:/Python_Script/xlsx_test.py", line 26, in <module>
wb.save(filename = dest_filename)
File "build\bdist.win32\egg\openpyxl\workbook\workbook.py", line 281, in save
save_workbook(self, filename)
File "build\bdist.win32\egg\openpyxl\writer\excel.py", line 214, in save_workbook
writer.save(filename)
File "build\bdist.win32\egg\openpyxl\writer\excel.py", line 196, in save
archive = ZipFile(filename, 'w', ZIP_DEFLATED)
File "C:\Python27\lib\zipfile.py", line 752, in __init__
self.fp = open(file, modeDict[mode])
IOError: [Errno 22] invalid mode ('wb') or filename: 'J:\\Python_Script\x08ook2.xls'
//----------------------
I am not sure why the file path is different now; the file name also differs from the one I entered in the script.
thanks
EDIT:
Solved. Just had to change from \ to / in the path.
In Python the \ is used in strings to escape characters. You can avoid this by using "raw strings" by prefixing with "r". So r'J:\Python_Script\book2.xls' should work.
However, when working with paths it's most common to use the os.path module to make sure these are correct.
import os
# A bare drive letter ("J:") joins to the drive-relative path "J:Python_Script\...",
# on Windows, so add the root separator after the drive to get an absolute path.
dest_filename = os.path.join("J:" + os.sep, "Python_Script", "book2.xlsx")
This is invaluable when writing portable code.

Gzip problem,traceback and IOError: [Errno 2] No such file or directory

I'm new to Python and to the bioinformatics field. I'm using Python 2.6. I'm trying to select all fastq.gz files, open each one with gzip.open (reading just a few lines, because the files are huge and reading them fully would waste time), count the occurrences of 'J', and then pick out the files whose 'J' count is NOT equal to 0.
The following is my code:
#!/usr/bin/python
import os,sys,re,gzip
path = "/home/XXX/nearline"
# For every *.recal.fastq.gz name in `path`, read the first 10 lines and print
# the file name depending on a count of 'J'.
# (Python 2 code; the indentation was lost in this listing.)
for file in os.listdir(path):
if re.match('.*\.recal.fastq.gz', file):
# NOTE(review): os.listdir() yields bare file names, so gzip.open(file) looks
# in the current working directory, not in `path`.
# readlines() also reads the whole file before the [:10] slice is taken.
text = gzip.open(file,'r').readlines()[:10]
# NOTE(review): `text` is a list of lines, and a list has no split() method.
word_list = text.split()
number = word_list.count('J') + 1
# NOTE(review): `!==` is not a Python operator (use `!=`); and because of the
# `+ 1` above, `number` can never be 0.
if number !== 0:
print file
But I got some errors:
Traceback (most recent call last):
File "fastqfilter.py", line 9, in <module>
text = gzip.open(file,'r').readlines()[:10]
File "/share/lib/python2.6/gzip.py", line 33, in open
return GzipFile(filename, mode, compresslevel)
File "/share/lib/python2.6/gzip.py", line 79, in __init__
fileobj = self.myfileobj = __builtin__.open(filename, mode or 'rb')
IOError: [Errno 2] No such file or directory: 'ERR001268_1.recal.fastq.gz'
What does this traceback ("File ...") mean?
Is there anything wrong with gzip here?
And why can't it find ERR001268_1.recal.fastq.gz? It's the first fastq file in the list, and it DOES exist there.
I hope you can give me some clues and point out any other errors in the script.
Thanks a lot.
Edit: thx everyone. I followed Dan's suggestion. And I tried on ONE fastq file first. My script goes like:
#!/usr/bin/python
import os,sys
import gzip
import itertools
# Single-file trial: open one fastq.gz by its full path and test for 'J'.
# (Python 2 code.)
file = gzip.open('/home/xug/nearline/ERR001274_1.recal.fastq.gz','r')
# NOTE(review): the list built on the next line is not assigned to a name, so
# it is discarded.
list(itertools.islice(file.xreadlines(),10))
# NOTE(review): `list` here is the builtin type, not the lines read above.
word_list = list.split()
# NOTE(review): because of the `+ 1`, `number` can never be 0.
number = word_list.count('J') + 1
if number != 0:
print 'ERR001274_1.recal.fastq.gz'
Then errors are:
Traceback (most recent call last):
File "try2.py", line 8, in <module>
list(itertools.islice(text.xreadlines(),10))
AttributeError: GzipFiles instance has no attribute 'xreadlines'
Edit again: Thx Dan, I've solved the problem yesterday. Seems GzipFiles don't support xreadlines. So I tried the similar way as you suggested later. And it works. See below:
#!/usr/bin/python
import os,sys,re
import gzip
from itertools import islice
path = "/home/XXXX/nearline"
# Print the name of every *.recal.fastq.gz file in `path` whose first 1000
# lines contain at least one 'J'.
for file in os.listdir(path):
    if file.endswith('.recal.fastq.gz'):
        # os.listdir() returns bare names, so build the full path before opening.
        fullpath = os.path.join(path, file)
        myfile = gzip.open(fullpath,'r')
        # try/finally rather than `with`: GzipFile only supports the with
        # statement from Python 2.7 on, and the handle must still be closed.
        try:
            # Stop at the first line containing 'J'. b'J' is a plain str on
            # Python 2 and matches the bytes lines gzip yields on Python 3.
            has_j = any(b'J' in line for line in islice(myfile, 1000))
        finally:
            myfile.close()
        if has_j:
            print(file)
on this line:
text = gzip.open(file,'r').read()
file is a filename not a full path so
fullpath = os.path.join(path, file)
text = gzip.open(fullpath,'r').read()
About F.readlines()[:10]: this reads the whole file into a list of lines and only then takes the first 10.
import itertools
list(itertools.islice(F.xreadlines(),10))
This does not read the whole file into memory; it reads only the first 10 lines into a list.
However, gzip.open returns an object that doesn't have .xreadlines(). Since file objects are iterable over their lines, just:
list(itertools.islice(F,10))
would work as this test shows:
>>> import gzip,itertools
>>> list(itertools.islice(gzip.open("/home/dan/Desktop/rp718.ps.gz"),10))
['%!PS-Adobe-2.0\n', '%%Creator: dvips 5.528 Copyright 1986, 1994 Radical Eye Software\n', '%%Title: WLP-94-int.dvi\n', '%%CreationDate: Mon Jan 16 16:24:41 1995\n', '%%Pages: 6\n', '%%PageOrder: Ascend\n', '%%BoundingBox: 0 0 596 842\n', '%%EndComments\n', '%DVIPSCommandLine: dvips -f WLP-94-int.dvi\n', '%DVIPSParameters: dpi=300, comments removed\n']
Change your code to:
#!/usr/bin/python
import os,sys,re,gzip
path = "/home/XXX/nearline"
# Print the name of every *.recal.fastq.gz file in `path` whose first 10 lines
# contain at least one 'J'.
for file in os.listdir(path):
    if file.endswith('.recal.fastq.gz'):
        # Open by full path: os.listdir() returns bare names, which would
        # otherwise be looked up in the current working directory.
        handle = gzip.open(os.path.join(path,file),'r')
        # try/finally rather than `with`: GzipFile only supports the with
        # statement from Python 2.7 on.
        try:
            # readline() ten times reads only the head of the file, whereas
            # readlines()[:10] would decompress all of it first.
            head = [handle.readline() for _ in range(10)]
        finally:
            handle.close()
        # Count 'J' characters. b'J' is a plain str on Python 2 and matches
        # the bytes lines gzip yields on Python 3.
        number = sum(line.count(b'J') for line in head)
        if number != 0:
            print(file)
It's trying to open ERR001268_1.recal.fastq.gz from the working directory, not from /home/XXX/nearline.

Categories