Upload large file (>100 MB) directly to github with pygithub - python

I am using pyGitHub to upload files to my repo, however some of the files are so large that the server connection times out. My code to upload/overwrite a file from a folder is:
def commit(folder):
foldername = folder.split("/")[-1]
onlyfiles = [f for f in listdir(folder) if isfile(join(folder, f))]
repo = g.get_repo(user.login+"/My-repo")
all_files = []
contents = repo.get_contents("")
while contents:
file_content = contents.pop(0)
if file_content.type == "dir":
contents.extend(repo.get_contents(file_content.path))
else:
file = file_content
all_files.append(str(file).replace('ContentFile(path="','').replace('")',''))
body = '''
Line 1: Message
Line 2: Sample Text
Line 3: yet another line
'''
for i in onlyfiles:
print(i)
input_file = open(folder + "/" + i, "rb")
data = input_file.read()
input_file.close()
if not(f"{foldername}/{i}" in all_files):
repo.create_file(f"{foldername}/{i}", "Created building data", data)
else:
file = repo.get_contents(f"{foldername}/{i}")
repo.update_file(file.path, "Updated information", data, file.sha)
This code works for files <25mb, but for larger ones I get the error:
---------------------------------------------------------------------------
GithubException Traceback (most recent call last)
<ipython-input-9-7d41473c81a0> in <module>()
79
80
---> 81 commit(str("/content/"+dirname))
3 frames
<ipython-input-9-7d41473c81a0> in commit(folder)
72 input_file.close()
73 if not(f"{foldername}/{i}" in all_files):
---> 74 repo.create_file(f"{foldername}/{i}", "Created building data", data)
75 else:
76 file = repo.get_contents(f"{foldername}/{i}")
/usr/local/lib/python3.7/dist-packages/github/Repository.py in create_file(self, path, message, content, branch, committer, author)
2091 "PUT",
2092 f"{self.url}/contents/{urllib.parse.quote(path)}",
-> 2093 input=put_parameters,
2094 )
2095
/usr/local/lib/python3.7/dist-packages/github/Requester.py in requestJsonAndCheck(self, verb, url, parameters, headers, input)
353 return self.__check(
354 *self.requestJson(
--> 355 verb, url, parameters, headers, input, self.__customConnection(url)
356 )
357 )
/usr/local/lib/python3.7/dist-packages/github/Requester.py in __check(self, status, responseHeaders, output)
376 output = self.__structuredFromJson(output)
377 if status >= 400:
--> 378 raise self.__createException(status, responseHeaders, output)
379 return responseHeaders, output
380
GithubException: 502 {"message": "Server Error"}
I am aware that the file upload limit for github is 25MB, but apparently files up to 100MB can be uploaded via the command line. How would I upload files larger than this to GitHub using pyGitHub? The file is zipped, so it really is as small as it can be, but is still ~150MB. Is this doable? If not, is there a way to reference a larger file in github which I can upload elsewhere? I am using Google Colab in case anyone is wondering.

Related

For loop crashing on speech_recognition

Python Newbie trying to teach myself how to use Python to run speech_recognition, and Im not having great luck.
The code below runs once and correctly converts a wav file to text, but then it crashes before running the remaining 2 wav files in my S3 bucket. The files are absolutely there:
OSR_us_000_0010_8k.wav
OSR_us_000_0011_8k.wav
OSR_us_000_0012_8k.wav
I could use some help fixing it.
Thanks in Advance.
import boto3
import speech_recognition as sr
r = sr.Recognizer()
session = boto3.client('s3',
aws_access_key_id= XXXX,
aws_secret_access_key=XXXX,
region_name='XXXX')
my_bucket = s3.Bucket(mys3bucket)
for my_bucket_object in my_bucket.objects.all():
with sr.AudioFile(my_bucket_object.key) as source:
print(my_bucket_object.key)
audio_data = r.record(source)
text = r.recognize_google(audio_data)
print(text)
OSR_us_000_0010_8k.wav<br>
Birch canoe slid on the smooth plank glue the sea to a dark blue background it is easy to tell the depth of a well these day the chicken leg of a variegated rice is often served in roundels the juice of lemons mix find the boxes on the side the pump truck the ha grimstead topcon and garbage for hours of Citi workspace a large-sized and stockings in the hearts of cell
---------------------------------------------------------------------------
FileNotFoundError Traceback (most recent call last)
<ipython-input-4-385959f26678> in <module>
14
15 for my_bucket_object in my_bucket.objects.all():
---> 16 with sr.AudioFile(my_bucket_object.key) as source:
17 print(my_bucket_object.key)
18 audio_data = r.record(source)
~/anaconda3/envs/mxnet_latest_p37/lib/python3.7/site-packages/speech_recognition/__init__.py in __enter__(self)
201 try:
202 # attempt to read the file as WAV
--> 203 self.audio_reader = wave.open(self.filename_or_fileobject, "rb")
204 self.little_endian = True # RIFF WAV is a little-endian format (most ``audioop`` operations assume that the frames are stored in little-endian form)
205 except (wave.Error, EOFError):
~/anaconda3/envs/mxnet_latest_p37/lib/python3.7/wave.py in open(f, mode)
508 mode = 'rb'
509 if mode in ('r', 'rb'):
--> 510 return Wave_read(f)
511 elif mode in ('w', 'wb'):
512 return Wave_write(f)
~/anaconda3/envs/mxnet_latest_p37/lib/python3.7/wave.py in __init__(self, f)
158 self._i_opened_the_file = None
159 if isinstance(f, str):
--> 160 f = builtins.open(f, 'rb')
161 self._i_opened_the_file = f
162 # else, assume it is an open file object already
FileNotFoundError: [Errno 2] No such file or directory: 'OSR_us_000_0011_8k.wav'

Error trying to use AudioSegment for .wav files

I'm trying to iterate through all the .wav files in a folder "audios", but I receive the following error. I found similar questions that were solved by installing ffmpeg, but that didn't help.
FileNotFoundError Traceback (most recent call last)
<ipython-input-24-29ba732186ac> in <module>
1 for audio_file in os.listdir(base_path+"audios"):
2 # read wav audio file
----> 3 audio = AudioSegment.from_wav(audio_file)
4
5 # pass audio file, start time, end time & chunk path to create chunk
~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\pydub\audio_segment.py in from_wav(cls, file, parameters)
806 #classmethod
807 def from_wav(cls, file, parameters=None):
--> 808 return cls.from_file(file, 'wav', parameters=parameters)
809
810 #classmethod
~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\pydub\audio_segment.py in from_file(cls, file, format, codec, parameters, start_second, duration, **kwargs)
649 except TypeError:
650 filename = None
--> 651 file, close_file = _fd_or_path_or_tempfile(file, 'rb', tempfile=False)
652
653 if format:
~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\pydub\utils.py in _fd_or_path_or_tempfile(fd, mode, tempfile)
58
59 if isinstance(fd, basestring):
---> 60 fd = open(fd, mode=mode)
61 close_fd = True
62
FileNotFoundError: [Errno 2] No such file or directory: 'name_of_file.wav'
os.listdir doesn't return the full paths of files in the directory you give it, just the names they have within that directory. You will need to prepend this directory name to the filename you pass to AudioSegment.from_wav.
Try replacing the line
audio = AudioSegment.from_wav(audio_file)
with
audio = AudioSegment.from_wav(os.path.join(base_path+"audios", audio_file))

Why there is a error message Exception: This file is already closed

1.I was trying to write a python code to get all contents of files in each subfolder and create a index for each content (file contents). All the contents for each file can be get successfully. However, when I run the code, it always shows an error message Exception: This file is already closed.
2.Here is the code for building an index for each content, could someone explain to me why this thing could happened?
The traceback:
python-input-49-38a47b2f8c0c> in <module>
39 print(searcher)
40
---> 41 writers.commit(optimize=True)
42
43 # from whoosh.query import *
~/.local/lib/python3.8/site-packages/whoosh/writing.py in commit(self, mergetype, optimize, merge)
928 else:
929 # Close segment files
--> 930 self._close_segment()
931 # Write TOC
932 self._commit_toc(finalsegments)
~/.local/lib/python3.8/site-packages/whoosh/writing.py in _close_segment(self)
841 def _close_segment(self):
842 if not self.perdocwriter.is_closed:
--> 843 self.perdocwriter.close()
844 if not self.fieldwriter.is_closed:
845 self.fieldwriter.close()
~/.local/lib/python3.8/site-packages/whoosh/codec/whoosh3.py in close(self)
265 for writer in self._colwriters.values():
266 writer.finish(self._doccount)
--> 267 self._cols.save_as_files(self._storage, self._column_filename)
268
269 # If vectors were written, close the vector writers
~/.local/lib/python3.8/site-packages/whoosh/filedb/compound.py in save_as_files(self, storage, name_fn)
295
296 def save_as_files(self, storage, name_fn):
--> 297 for name, blocks in self._readback():
298 f = storage.create_file(name_fn(name))
299 for block in blocks():
~/.local/lib/python3.8/site-packages/whoosh/filedb/compound.py in _readback(self)
276
277 yield (name, gen)
--> 278 temp.close()
279 self._tempstorage.delete_file(self._tempname)
280
~/.local/lib/python3.8/site-packages/whoosh/filedb/structfile.py in close(self)
121
122 if self.is_closed:
--> 123 raise Exception("This file is already closed")
124 if self.onclose:
125 self.onclose(self)
Exception: This file is already closed
import os
import codecs
import whoosh
from whoosh.index import create_in
from whoosh.fields import *
from whoosh.qparser import QueryParser
schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT,textdata=TEXT(stored=True))
ix = create_in("folder", schema)
filelist = []
for root, dirs, files in os.walk("./test_result"):
for file in files:
#append the file name to the list
filelist.append(os.path.join(root,file))
#print all the file names
writer = ix.writer()
i = 0
for name in filelist:
i = i +1
with codecs.open (name, "r",encoding='utf-8',
errors='ignore') as myfile:
text=myfile.read()
# print ("adding document "+name)
writer.add_document(title="document "+name, path="folder",content=text,textdata=text)
myfile.close()
print(text)
searcher = ix.searcher()
print(searcher)
writers.commit(optimize=True)
with statement handles resources management, including file closing. You could read more about it here.
This code:
f = open(file)
f.write("blablabla")
f.close
is equivalent to this:
with open(file) as f
f.write("blablabla")
This exception is a result of you trying to close a file that is already closed implicitly by with statement.
You only need to delete this line:
myfile.close()
EDIT:
I just explained the error in the code, but didn't notice the update in the comments. Please update the question itself with the mentioned line deleted.
On a side note, I see you used writers.commit() instead of writer.commit(), please make sure it's not a typo and update your question if your code still doesn't work.

WatsonApiException: Error: The Preview API was removed on 2019-09-30

I am trying to access the IBM Watson Discovery API (Free Trial) using the piece of code below:
with open(filename, "r") as f:
res = discovery.test_configuration_in_environment(environment_id=env_id, configuration_id=cfg_id, file=f).get_result()
You can view the full code file here: https://github.com/udacity/AIND-NLP-Bookworm/blob/master/bookworm.ipynb. I am getting the following error on running this:
---------------------------------------------------------------------------
WatsonApiException Traceback (most recent call last)
<ipython-input-10-17e98c795a32> in <module>()
3 filename = os.path.join(data_dir, "sample.html")
4 with open(filename, "r") as f:
----> 5 res = discovery.test_configuration_in_environment(environment_id=env_id, configuration_id=cfg_id, file=f).get_result()
6 print(json.dumps(res, indent=2))
/opt/conda/lib/python3.6/site-packages/watson_developer_cloud/discovery_v1.py in test_configuration_in_environment(self, environment_id, configuration, step, configuration_id, file, metadata, file_content_type, filename, **kwargs)
702 params=params,
703 files=form_data,
--> 704 accept_json=True)
705 return response
706
/opt/conda/lib/python3.6/site-packages/watson_developer_cloud/watson_service.py in request(self, method, url, accept_json, headers, params, json, data, files, **kwargs)
585 error_info = self._get_error_info(response)
586 raise WatsonApiException(response.status_code, error_message,
--> 587 info=error_info, httpResponse=response)
WatsonApiException: Error: The Preview API was removed on 2019-09-30., Code: 410 , X-dp-watson-tran-id: 569fbd407a75c23f850522571bddee26 , X-global-transaction-id: 569fbd407a75c23f850522571bddee26
Any known workarounds?
As per the release notes and the response you are seeing the Preview API was deprecated on the 4th June 2019 - https://cloud.ibm.com/docs/discovery?topic=discovery-release-notes#4jun19 - and removed on the 30 Sept 2019 - https://cloud.ibm.com/docs/discovery?topic=discovery-release-notes
The Discovery API is still available - https://cloud.ibm.com/apidocs/discovery/discovery , just not the preview method.
What is it that you are trying to do?

To unzip a file

I want to unzip a file of type *.sec.gz which is a zipfile. But i'm getting badfile.....Can someone guide to resolve this.....File present in the folder is of type *.sec ........Thanks in advance
import zipfile
def unzip(path):
zfile = zipfile.ZipFile(path)
for name in zfile.namelist():
(dirname, filename) = os.path.split(name)
if filename == '':
# directory
if not os.path.exists(dirname):
os.mkdir(dirname)
else:
# file
fd = open(name, 'w')
fd.write(zfile.read(name))
fd.close()
zfile.close()
k=unzip('C://test//08October2014//DATA_INTV_NEW//Oct0814//1.sec.gz')
Output:
BadZipfile Traceback (most recent call last)
<ipython-input-7-5134b63e752e> in <module>()
27 zfile.close()
28
---> 29 k=unzip('C://test//08October2014//DATA_INTV_NEW//Oct0814//1.sec.gz')
<ipython-input-7-5134b63e752e> in unzip(path)
13
14 def unzip(path):
---> 15 zfile = zipfile.ZipFile(path)
16 for name in zfile.namelist():
17 (dirname, filename) = os.path.split(name)
C:\Python27\Lib\zipfile.pyc in __init__(self, file, mode, compression, allowZip64)
768 try:
769 if key == 'r':
--> 770 self._RealGetContents()
771 elif key == 'w':
772 # set the modified flag so central directory gets written
C:\Python27\Lib\zipfile.pyc in _RealGetContents(self)
809 raise BadZipfile("File is not a zip file")
810 if not endrec:
--> 811 raise BadZipfile, "File is not a zip file"
812 if self.debug > 1:
813 print endrec
BadZipfile: File is not a zip file
The error message is completely accurate: that is not a zip file. It is a gzip file, which is something completely different. You should use the gzip module.

Categories