To unzip a file

To unzip a file - python

I want to unzip a file of type *.sec.gz, which I believe is a zip file, but I'm getting a BadZipfile error. The file present in the folder is of type *.sec. Can someone guide me on how to resolve this? Thanks in advance.
import zipfile
def unzip(path):
    """Extract every member of the ZIP archive at ``path`` into the current directory.

    Members are written relative to the current working directory, using the
    directory layout stored in the archive. Missing parent directories are
    created, including nested ones that have no explicit entry in the archive.

    Args:
        path: Filesystem path of a ZIP archive. Other formats (for example
            gzip ``.gz`` files) are not ZIP archives and are rejected by
            ``zipfile.ZipFile``.

    Raises:
        zipfile.BadZipFile: If ``path`` is not a valid ZIP archive (the
            exception is spelled ``BadZipfile`` on Python 2).
    """
    # Imported locally because the snippet only imports zipfile at file level.
    import os

    # The context manager closes the archive even if a write fails part-way.
    # NOTE: member names are used as-is; for archives from untrusted sources
    # prefer ZipFile.extractall(), which sanitizes absolute and ".." paths.
    with zipfile.ZipFile(path) as zfile:
        for name in zfile.namelist():
            dirname, filename = os.path.split(name)
            # makedirs handles nested folders and members whose parent folder
            # was never listed as its own archive entry.
            if dirname and not os.path.isdir(dirname):
                os.makedirs(dirname)
            if filename == '':
                # A pure directory entry ("folder/"): nothing to write.
                continue
            # zfile.read() returns raw bytes, so the target must be opened in
            # binary mode; text mode corrupts data or raises on Python 3.
            with open(name, 'wb') as fd:
                fd.write(zfile.read(name))
# NOTE: 1.sec.gz is a gzip file, not a ZIP archive, so zipfile.ZipFile raises
# BadZipfile for this path (see the traceback below); use the gzip module for it.
k=unzip('C://test//08October2014//DATA_INTV_NEW//Oct0814//1.sec.gz')
Output:
BadZipfile Traceback (most recent call last)
<ipython-input-7-5134b63e752e> in <module>()
27 zfile.close()
28
---> 29 k=unzip('C://test//08October2014//DATA_INTV_NEW//Oct0814//1.sec.gz')
<ipython-input-7-5134b63e752e> in unzip(path)
13
14 def unzip(path):
---> 15 zfile = zipfile.ZipFile(path)
16 for name in zfile.namelist():
17 (dirname, filename) = os.path.split(name)
C:\Python27\Lib\zipfile.pyc in __init__(self, file, mode, compression, allowZip64)
768 try:
769 if key == 'r':
--> 770 self._RealGetContents()
771 elif key == 'w':
772 # set the modified flag so central directory gets written
C:\Python27\Lib\zipfile.pyc in _RealGetContents(self)
809 raise BadZipfile("File is not a zip file")
810 if not endrec:
--> 811 raise BadZipfile, "File is not a zip file"
812 if self.debug > 1:
813 print endrec
BadZipfile: File is not a zip file

The error message is completely accurate: that is not a zip file. It is a gzip file, which is something completely different. You should use the gzip module instead, for example gzip.open(path, 'rb').read() to get the decompressed contents.

Related

Upload large file (>100 MB) directly to github with pygithub

I am using pyGitHub to upload files to my repo, however some of the files are so large that the server connection times out. My code to upload/overwrite a file from a folder is:
def commit(folder):
    """Upload every regular file in ``folder`` to the GitHub repository.

    Each file is stored in the repository under
    ``<last component of folder>/<file name>``. A file whose path already
    exists in the repository is updated; any other file is created.

    Relies on the names ``g``, ``user``, ``listdir``, ``isfile`` and ``join``
    being defined at module level outside this snippet.

    Args:
        folder: "/"-separated path of the local directory to upload.
    """
    foldername = folder.split("/")[-1]
    onlyfiles = [f for f in listdir(folder) if isfile(join(folder, f))]
    repo = g.get_repo(user.login+"/My-repo")

    # Walk the repository tree and record the path of every existing file.
    # A set gives constant-time membership tests in the upload loop below.
    existing_paths = set()
    pending = repo.get_contents("")
    while pending:
        entry = pending.pop()
        if entry.type == "dir":
            pending.extend(repo.get_contents(entry.path))
        else:
            # Use the path attribute directly instead of parsing repr(entry).
            existing_paths.add(entry.path)

    for i in onlyfiles:
        print(i)
        # The with-block closes the file even if read() raises.
        with open(folder + "/" + i, "rb") as input_file:
            data = input_file.read()
        target = f"{foldername}/{i}"
        if target not in existing_paths:
            repo.create_file(target, "Created building data", data)
        else:
            # update_file needs the blob sha of the version being replaced.
            file = repo.get_contents(target)
            repo.update_file(file.path, "Updated information", data, file.sha)
This code works for files <25mb, but for larger ones I get the error:
---------------------------------------------------------------------------
GithubException Traceback (most recent call last)
<ipython-input-9-7d41473c81a0> in <module>()
79
80
---> 81 commit(str("/content/"+dirname))
3 frames
<ipython-input-9-7d41473c81a0> in commit(folder)
72 input_file.close()
73 if not(f"{foldername}/{i}" in all_files):
---> 74 repo.create_file(f"{foldername}/{i}", "Created building data", data)
75 else:
76 file = repo.get_contents(f"{foldername}/{i}")
/usr/local/lib/python3.7/dist-packages/github/Repository.py in create_file(self, path, message, content, branch, committer, author)
2091 "PUT",
2092 f"{self.url}/contents/{urllib.parse.quote(path)}",
-> 2093 input=put_parameters,
2094 )
2095
/usr/local/lib/python3.7/dist-packages/github/Requester.py in requestJsonAndCheck(self, verb, url, parameters, headers, input)
353 return self.__check(
354 *self.requestJson(
--> 355 verb, url, parameters, headers, input, self.__customConnection(url)
356 )
357 )
/usr/local/lib/python3.7/dist-packages/github/Requester.py in __check(self, status, responseHeaders, output)
376 output = self.__structuredFromJson(output)
377 if status >= 400:
--> 378 raise self.__createException(status, responseHeaders, output)
379 return responseHeaders, output
380
GithubException: 502 {"message": "Server Error"}
I am aware that the file upload limit for github is 25MB, but apparently files up to 100MB can be uploaded via the command line. How would I upload files larger than this to GitHub using pyGitHub? The file is zipped, so it really is as small as it can be, but is still ~150MB. Is this doable? If not, is there a way to reference a larger file in github which I can upload elsewhere? I am using Google Colab in case anyone is wondering.

How to solve BadZipFile: File is not a zip file error in Jupyter Notebook?

I'm trying to read .xlsx files from folders in a specific directory and to write/export them into 4 new .xlsx files, each of which will contain the data catalog for one year.
The script worked well when I tried it some months ago, but it's not working anymore. It keeps raising a BadZipFile: File is not a zip file error.
Am I missing something? I've tried upgrading and downgrading the anaconda, python, openpyxl, and pandas versions, but it doesn't help.
from openpyxl import load_workbook
import pandas as pd
import os
import re
# Root folder; it contains one sub-folder per year, each holding one
# sub-folder per station with that station's .xlsx files.
path_folder = r'C:\\Users\\lala\\Downloads\\New folder\\Data Klimatologi\\'
year_folder = os.listdir(path_folder)
for x in year_folder:
    # Station folders of the current year (previously assigned to the wrong
    # name, which left station_folder undefined and clobbered year_folder).
    station_folder = os.listdir(path_folder + x)
    frames = []
    for y in station_folder:
        path_file = path_folder + '{}\\{}'.format(x,y)
        files = os.listdir(path_file)
        for z in files:
            pattern = path_folder + '{}\\{}\\{}'.format(x,y,z)
            wb = load_workbook(filename = pattern)
            sheet = wb.active#has 1 sheet
            # Last row of column B that holds a value bounds the data range.
            max_row_for_Tn = max((b.row for b in sheet['B'] if b.value is not None))
            cell = 'A9:K%d' % (max_row_for_Tn)
            data = sheet[cell]
            # Row 9 is used as the header; the remaining rows are the records.
            row_list = [[col.value for col in row] for row in data]
            df = pd.DataFrame(data = row_list[1:], index=None, columns=row_list[0])
            # Cell C1 holds the station id; strip whitespace and any other
            # unwanted characters before converting it to an integer.
            cell_id = sheet.cell(row = 1, column = 3)
            pk = cell_id.value
            pk = re.sub(r'[\s]+', '', pk)
            pk = int(re.sub(r'[^.,a-zA-Z0-9 \n\.]','', pk))
            df['Id WMO'] = pk
            frames.append(df)
    # One output workbook per year, combining all stations of that year.
    result = pd.concat(frames)
    result.to_excel(r'C:\Users\lala\OneDrive\Documents\Dataset\Dataset Stasiun BMKG Tahun {}.xlsx'.format(x), index = False)
The script works well up to year_folder, which gives the output ('2000','2001','2002','2003','2004').
Here's the traceback.
---------------------------------------------------------------------------
BadZipFile Traceback (most recent call last)
<ipython-input-4-e8e2d94d1368> in <module>
7 for z in files:
8 pattern = path_folder + '{}\\{}\\{}'.format(x,y,z)
----> 9 wb = load_workbook(filename = pattern)
10 sheet = wb.active#has 1 sheet
11 max_row_for_Tn = max((b.row for b in sheet['B'] if b.value is not None))
~\anaconda3\envs\Pandas\lib\site-packages\openpyxl\reader\excel.py in load_workbook(filename, read_only, keep_vba, data_only, keep_links)
312 """
313 reader = ExcelReader(filename, read_only, keep_vba,
--> 314 data_only, keep_links)
315 reader.read()
316 return reader.wb
~\anaconda3\envs\Pandas\lib\site-packages\openpyxl\reader\excel.py in __init__(self, fn, read_only, keep_vba, data_only, keep_links)
122 def __init__(self, fn, read_only=False, keep_vba=KEEP_VBA,
123 data_only=False, keep_links=True):
--> 124 self.archive = _validate_archive(fn)
125 self.valid_files = self.archive.namelist()
126 self.read_only = read_only
~\anaconda3\envs\Pandas\lib\site-packages\openpyxl\reader\excel.py in _validate_archive(filename)
94 raise InvalidFileException(msg)
95
---> 96 archive = ZipFile(filename, 'r')
97 return archive
98
~\anaconda3\envs\Pandas\lib\zipfile.py in __init__(self, file, mode, compression, allowZip64)
1129 try:
1130 if mode == 'r':
-> 1131 self._RealGetContents()
1132 elif mode in ('w', 'x'):
1133 # set the modified flag so central directory gets written
~\anaconda3\envs\Pandas\lib\zipfile.py in _RealGetContents(self)
1196 raise BadZipFile("File is not a zip file")
1197 if not endrec:
-> 1198 raise BadZipFile("File is not a zip file")
1199 if self.debug > 1:
1200 print(endrec)
BadZipFile: File is not a zip file

The error message is exactly correct. Current versions of Excel use the .xlsx format, which are zip files containing a small directory tree. That format was not introduced until Excel 2007. Assuming those files really are from 2001, 2002, etc., they are in the old-style Excel .xls format, which is not a zip file. pandas does not know how to import .xls files. You may need to find a separate module to convert them.

It turned out one of the .xlsx files was duplicated. I deleted the duplicate and the error isn't showing up anymore.
If anyone runs into the same error, check whether there are corrupted or duplicated files in your directory.
It won't be a problem to use the same file name as long as the files contain different values.

Error trying to use AudioSegment for .wav files

I'm trying to iterate through all the .wav files in a folder "audios", but I receive the following error. I found similar questions that were solved by installing ffmpeg, but that didn't help.
FileNotFoundError Traceback (most recent call last)
<ipython-input-24-29ba732186ac> in <module>
1 for audio_file in os.listdir(base_path+"audios"):
2 # read wav audio file
----> 3 audio = AudioSegment.from_wav(audio_file)
4
5 # pass audio file, start time, end time & chunk path to create chunk
~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\pydub\audio_segment.py in from_wav(cls, file, parameters)
806 @classmethod
807 def from_wav(cls, file, parameters=None):
--> 808 return cls.from_file(file, 'wav', parameters=parameters)
809
810 @classmethod
~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\pydub\audio_segment.py in from_file(cls, file, format, codec, parameters, start_second, duration, **kwargs)
649 except TypeError:
650 filename = None
--> 651 file, close_file = _fd_or_path_or_tempfile(file, 'rb', tempfile=False)
652
653 if format:
~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\pydub\utils.py in _fd_or_path_or_tempfile(fd, mode, tempfile)
58
59 if isinstance(fd, basestring):
---> 60 fd = open(fd, mode=mode)
61 close_fd = True
62
FileNotFoundError: [Errno 2] No such file or directory: 'name_of_file.wav'

os.listdir doesn't return the full paths of files in the directory you give it, just the names they have within that directory. You will need to prepend this directory name to the filename you pass to AudioSegment.from_wav.
Try replacing the line
audio = AudioSegment.from_wav(audio_file)
with
audio = AudioSegment.from_wav(os.path.join(base_path+"audios", audio_file))

zipfile extractall raising "BadZipFile: Bad CRC-32 for file" error

This is the file I am trying to open
https://drive.google.com/file/d/1K2kDBTNXS2ikx9xKmi2Fy0Wsc5u_Lls0/view
It is described here
https://github.com/armancohan/long-summarization
After I added the file to my google drive, this is the code I am trying to use to open it.
from google.colab import drive
drive.mount('/content/gdrive')
import zipfile
# Open the archive in a with-block so it is closed even when extractall()
# raises (for example on a corrupted member), instead of relying on a
# trailing close() call that is skipped on error.
with zipfile.ZipFile('/content/gdrive/My Drive/arxiv-release.zip', 'r') as zip_ref:
    zip_ref.extractall('arxiv-release')
This is the error that is raised
---------------------------------------------------------------------------
BadZipFile Traceback (most recent call last)
<ipython-input-9-9965160388a1> in <module>()
1
----> 2 zip_ref.extractall('arxiv-release')
3 zip_ref.close()
5 frames
/usr/lib/python3.6/zipfile.py in extractall(self, path, members, pwd)
1522
1523 for zipinfo in members:
-> 1524 self._extract_member(zipinfo, path, pwd)
1525
1526 @classmethod
/usr/lib/python3.6/zipfile.py in _extract_member(self, member, targetpath, pwd)
1577 with self.open(member, pwd=pwd) as source, \
1578 open(targetpath, "wb") as target:
-> 1579 shutil.copyfileobj(source, target)
1580
1581 return targetpath
/usr/lib/python3.6/shutil.py in copyfileobj(fsrc, fdst, length)
77 """copy data from file-like object fsrc to file-like object fdst"""
78 while 1:
---> 79 buf = fsrc.read(length)
80 if not buf:
81 break
/usr/lib/python3.6/zipfile.py in read(self, n)
870 self._offset = 0
871 while n > 0 and not self._eof:
--> 872 data = self._read1(n)
873 if n < len(data):
874 self._readbuffer = data
/usr/lib/python3.6/zipfile.py in _read1(self, n)
960 if self._left <= 0:
961 self._eof = True
--> 962 self._update_crc(data)
963 return data
964
/usr/lib/python3.6/zipfile.py in _update_crc(self, newdata)
888 # Check the CRC if we're at the end of the file
889 if self._eof and self._running_crc != self._expected_crc:
--> 890 raise BadZipFile("Bad CRC-32 for file %r" % self.name)
891
892 def read1(self, n):
BadZipFile: Bad CRC-32 for file 'arxiv-release/train.txt'

Iterate through all files in a directory and find and replace text - Python

Baby brand new. This was Frankenstein'ed together from a few similar topics, none of which seemed to cover the necessary step of nesting a find and replace inside a file loop.
I am attempting to iterate through every file in a folder (not recursively, I only have one folder level) of a specific type (listed here as a '.LIC') and replace a short bit of text. The following is as close as I could come:
import glob, os, fileinput
from glob import glob
# Question snippet, kept exactly as posted (indentation was lost in extraction).
root_dir = r"myPath"
os.chdir(root_dir)
# BUG: `glob, glob('*.LIC')` is a 2-tuple holding the glob function object and
# the list of matches, so the first `file` is the function itself and
# str(file) becomes '<function glob at 0x...>' -- the name in the OSError below.
for file in glob, glob('*.LIC'):
filename = str(file)
# NOTE: `as file` rebinds the loop variable `file` to the FileInput object.
with fileinput.FileInput(filename, inplace=True, backup='.bak') as file:
for line in file:
print(line.replace('findText', 'replaceText'), end='')
As you can imagine this went swimmingly. The error code is placed below.
OSError Traceback (most recent call last)
<ipython-input-61-e2fd0e9a5df9> in <module>()
6 filename = str(file)
7 with fileinput.FileInput(filename, inplace=True, backup='.bak') as file:
----> 8 for line in file:
9 print(line.replace('findText', 'replaceText'), end='')
10
C:\Users\Me\Anaconda3\lib\fileinput.py in __next__(self)
246 def __next__(self):
247 while True:
--> 248 line = self._readline()
249 if line:
250 self._filelineno += 1
C:\Users\Me\Anaconda3\lib\fileinput.py in _readline(self)
333 pass
334 # The next few lines may raise OSError
--> 335 os.rename(self._filename, self._backupfilename)
336 self._file = open(self._backupfilename, self._mode)
337 try:
OSError: [WinError 123] The filename, directory name, or volume label syntax is incorrect: '<function glob at 0x00000000013D3400>' -> '<function glob at 0x00000000013D3400>.bak'
I think my problem is nesting a reference to 'file', but I am unsure how to resolve this.
Thank you for the help in advance.

You should loop over the result of glob and not a tuple with the function object glob:
# Iterate over the matching file names themselves, one file at a time.
for filename in glob('*.LIC'):
    # With inplace=True, print() output is written back into the file being
    # read; the untouched original is kept next to it with a '.bak' suffix.
    with fileinput.FileInput(filename, inplace=True, backup='.bak') as stream:
        for text in stream:
            updated = text.replace('findText', 'replaceText')
            # end='' because each line already carries its own newline.
            print(updated, end='')

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

To unzip a file - python

The error message is completely accurate: that is not a zip file. It is a gzip file, which is something completely different. You should use the gzip module.

Related

Upload large file (>100 MB) directly to github with pygithub

How to solve BadZipFile: File is not a zip file error in Jupyter Notebook?

Error trying to use AudioSegment for .wav files

zipfile extractall raising "BadZipFile: Bad CRC-32 for file" error

Iterate through all files in a directory and find and replace text - Python

Categories

Resources