Pandas not reading tables from html files in folder - python

I am trying to read the tables of each individual HTML file in a folder using pandas, to find out how many tables each file contains.
This works when I specify a single file, but when I run it over the folder it says there are no tables.
This is the code for the single file
import pandas as pd
file = r'C:\Users\Ahmed_Abdelmuniem\Desktop\XXX.html'
table = pd.read_html(file)
print ('tables found:', len(table))
This is the output
C:\Users\Ahmed_Abdelmuniem\AppData\Local\Programs\Python\Python39\python.exe C:/Users/Ahmed_Abdelmuniem/PycharmProjects/PandaHTML/main.py
tables found: 72
Process finished with exit code 0
This is the code for each file in a folder
import pandas as pd
import shutil
import os
source_dir = r'C:\Users\Ahmed_Abdelmuniem\Desktop\TMorning'
target_dir = r'C:\Users\Ahmed_Abdelmuniem\Desktop\TAfternoon'
file_names = os.listdir(source_dir)
for file_name in file_names:
    table = pd.read_html(file_name)
    print('tables found:', len(table))
This is the error log:
C:\Users\Ahmed_Abdelmuniem\AppData\Local\Programs\Python\Python39\python.exe "C:/Users/Ahmed_Abdelmuniem/PycharmProjects/File mover V2.0/main.py"
Traceback (most recent call last):
File "C:\Users\Ahmed_Abdelmuniem\PycharmProjects\File mover V2.0\main.py", line 12, in <module>
table = pd.read_html(file_name)
File "C:\Users\Ahmed_Abdelmuniem\AppData\Local\Programs\Python\Python39\lib\site-packages\pandas\util\_decorators.py", line 299, in wrapper
return func(*args, **kwargs)
File "C:\Users\Ahmed_Abdelmuniem\AppData\Local\Programs\Python\Python39\lib\site-packages\pandas\io\html.py", line 1085, in read_html
return _parse(
File "C:\Users\Ahmed_Abdelmuniem\AppData\Local\Programs\Python\Python39\lib\site-packages\pandas\io\html.py", line 913, in _parse
raise retained
File "C:\Users\Ahmed_Abdelmuniem\AppData\Local\Programs\Python\Python39\lib\site-packages\pandas\io\html.py", line 893, in _parse
tables = p.parse_tables()
File "C:\Users\Ahmed_Abdelmuniem\AppData\Local\Programs\Python\Python39\lib\site-packages\pandas\io\html.py", line 213, in parse_tables
tables = self._parse_tables(self._build_doc(), self.match, self.attrs)
File "C:\Users\Ahmed_Abdelmuniem\AppData\Local\Programs\Python\Python39\lib\site-packages\pandas\io\html.py", line 543, in _parse_tables
raise ValueError("No tables found")
ValueError: No tables found
Process finished with exit code 1

os.listdir returns a list containing the names of the entries in the directory, including subdirectories and any other files; the names are relative to the directory, not full paths. If you want to keep only HTML files, prefer glob.glob:
import glob
file_names = glob.glob(os.path.join(source_dir, '*.html'))
Edit: if you want to use os.listdir, you have to get the actual path to the file:
for file_name in file_names:
    table = pd.read_html(os.path.join(source_dir, file_name))
    print('tables found:', len(table))
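For reference, a minimal end-to-end sketch of the glob-based version (assuming the same source_dir as above); glob.glob returns full paths, so each one can be passed straight to read_html:
import glob
import os
import pandas as pd

source_dir = r'C:\Users\Ahmed_Abdelmuniem\Desktop\TMorning'

# full paths are returned, so read_html can open each file directly
for file_name in glob.glob(os.path.join(source_dir, '*.html')):
    table = pd.read_html(file_name)
    print(file_name, '- tables found:', len(table))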

Related

How to read all parquet files from a s3 bucket

I currently have an s3 bucket that has folders with parquet files inside. I want to read all the individual parquet files and concatenate them into a pandas dataframe regardless of the folder they are in.
I am trying the following code:
import pyarrow.parquet as pq
import s3fs
s3 = s3fs.S3FileSystem()
pandas_dataframe = pq.ParquetDataset('s3://vivienda-test/2022/11', filesystem=s3).read_pandas().to_pandas()
print(pandas_dataframe)
I realize that it only works for concatenating the parquet files of a specific folder of the bucket, and it also gives me the following error:
Traceback (most recent call last):
File "/Users/Documents/inf.py", line 5, in <module>
pandas_dataframe = pq.ParquetDataset('s3://vivienda-test/2022/11', filesystem=s3).read_pandas().to_pandas()
File "/usr/local/lib/python3.10/site-packages/pyarrow/parquet/__init__.py", line 1790, in __init__
self.validate_schemas()
File "/usr/local/lib/python3.10/site-packages/pyarrow/parquet/__init__.py", line 1824, in validate_schemas
self._schema = self._pieces[0].get_metadata().schema
File "/usr/local/lib/python3.10/site-packages/pyarrow/parquet/__init__.py", line 1130, in get_metadata
f = self.open()
File "/usr/local/lib/python3.10/site-packages/pyarrow/parquet/__init__.py", line 1137, in open
reader = self.open_file_func(self.path)
File "/usr/local/lib/python3.10/site-packages/pyarrow/parquet/__init__.py", line 1521, in _open_dataset_file
return ParquetFile(
File "/usr/local/lib/python3.10/site-packages/pyarrow/parquet/__init__.py", line 286, in __init__
self.reader.open(
File "pyarrow/_parquet.pyx", line 1227, in pyarrow._parquet.ParquetReader.open
File "pyarrow/error.pxi", line 100, in pyarrow.lib.check_status
pyarrow.lib.ArrowInvalid: Parquet file size is 0 bytes
Can someone help me? Thanks.
You can use the AWS SDK for pandas (awswrangler) APIs to achieve the same:
https://aws-sdk-pandas.readthedocs.io/en/stable/stubs/awswrangler.s3.read_parquet.html
#Reading all Parquet files under a prefix
import awswrangler as wr
df = wr.s3.read_parquet(path='s3://bucket/prefix/')
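If the parquet files are spread across nested folders under a prefix, passing dataset=True should make awswrangler read everything underneath it recursively; a minimal sketch, reusing the bucket from the question (treat the exact path as an assumption):
import awswrangler as wr

# reads every parquet object under the prefix, including subfolders
df = wr.s3.read_parquet(path='s3://vivienda-test/2022/', dataset=True)
print(df.shape)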

Merge PDFs from python list of WindowsPath paths

I have an Excel file with some data in rows and columns, and I am aiming to take the file names from each row and merge those into one PDF file (simply, each row becomes one PDF file).
This is an example of a row as a Python list: ['1', '112238', '112239', '112240', '112337', '112338']. The first element will be the PDF name and the other elements are the file names that are supposed to exist in a directory named Files.
Here's my attempt till now
from pathlib import Path
import shutil
import pandas as pd
from PyPDF2 import PdfFileMerger

BASE_DIR = Path.cwd()
MAIN_DIR = BASE_DIR / 'Files'
FINAL_DIR = BASE_DIR / 'Final'

try:
    shutil.rmtree(FINAL_DIR)
except:
    pass

FINAL_DIR.mkdir(parents=True, exist_ok=True)

df = pd.read_excel('MainFile.xlsx', dtype=str)

for l in df.T.columns:
    new_list = list(df.T[l][df.T[l].notna()])
    files_list = [MAIN_DIR / f'{i}.pdf' for i in new_list[1:]]
    final_list = []
    final_list.append(new_list[0])
    for file in files_list:
        if file.exists():
            final_list.append(file)
        else:
            print(f'{file} ---> NOT Exists')

    merger = PdfFileMerger()
    for pdf in final_list[1:]:
        merger.append(pdf)
    merger.write(FINAL_DIR / f'{final_list[0]}.pdf')
    merger.close()
Here's a snapshot of the Excel file that I read the file names from, and of the PDF files in the directory named Files.
When I tried to run the script, I encountered an error like this:
Traceback (most recent call last):
File "C:\Users\Future\Desktop\demo.py", line 33, in <module>
merger.append(pdf)
File "C:\Users\Future\AppData\Local\Programs\Python\Python39\lib\site-packages\PyPDF2\merger.py", line 203, in append
self.merge(len(self.pages), fileobj, bookmark, pages, import_bookmarks)
File "C:\Users\Future\AppData\Local\Programs\Python\Python39\lib\site-packages\PyPDF2\merger.py", line 133, in merge
pdfr = PdfFileReader(fileobj, strict=self.strict)
File "C:\Users\Future\AppData\Local\Programs\Python\Python39\lib\site-packages\PyPDF2\pdf.py", line 1084, in __init__
self.read(stream)
File "C:\Users\Future\AppData\Local\Programs\Python\Python39\lib\site-packages\PyPDF2\pdf.py", line 1689, in read
stream.seek(-1, 2)
AttributeError: 'WindowsPath' object has no attribute 'seek'
I have tried the modification merger.append(str(Path(pdf))) and it seems to get past the first problem (I am not sure), but now I get another error:
PdfReadWarning: Xref table not zero-indexed. ID numbers for objects will be corrected. [pdf.py:1736]
Traceback (most recent call last):
File "C:\Users\Future\Desktop\demo.py", line 39, in <module>
merger.write(FINAL_DIR / f'{str(final_list[0])}.pdf')
File "C:\Users\Future\AppData\Local\Programs\Python\Python39\lib\site-packages\PyPDF2\merger.py", line 230, in write
self.output.write(fileobj)
File "C:\Users\Future\AppData\Local\Programs\Python\Python39\lib\site-packages\PyPDF2\pdf.py", line 487, in write
stream.write(self._header + b_("\n"))
AttributeError: 'WindowsPath' object has no attribute 'write'
Solved by these two modifications:
merger.append(str(Path(pdf)))
and I used os.path.join to build the output path, as I failed to get it working with Path:
merger.write(os.path.join(str(FINAL_DIR), str(final_list[0] + '.pdf')))
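Both tracebacks come from PyPDF2 receiving a WindowsPath object where it expects a path string or an open file object, so converting the Path to str should also work for write; a minimal pathlib-only sketch (an assumption about the setup, not a tested fix):
merger = PdfFileMerger()
for pdf in final_list[1:]:
    merger.append(str(pdf))  # plain string path for PyPDF2
merger.write(str(FINAL_DIR / f'{final_list[0]}.pdf'))
merger.close()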

How can I process only which files have been copied completely?

I am writing a Python script on Windows. I process zip files from a folder: if a file has finished copying before the program starts, it works fine, but I run into a problem when the program is already running and a file is still being copied into the folder. It raises an error and exits.
How can I process only the files that have been copied completely?
Or can I check whether the file size is still increasing and, if it has not increased over a 30-second interval, process it then (a sketch of this approach follows the error log below)?
My code:
import zipfile
import os
import xml.etree.ElementTree as ET
import shutil
import configparser
import time

# function used to read zip and xml
def read_zipfile(file):
    with zipfile.ZipFile(file, 'r') as zf:
        for name in zf.namelist():
            if name.endswith('.xml'):
                # open Zip and read Xml
                xml_content = zf.open(name)
                # here you do your magic with [f]: parsing, etc
                return xml_content

# function used to parse the XML file
def XMl_Parsa(f):
    tree = ET.parse(f)
    root = tree.getroot()
    # attributes are iterated to get the value of the tag
    for node in tree.iter('attribute'):
        if node.attrib['name'] == 'ProductNameCont':
            zone = str(node.text)
    return zone

# function used to move the file
def move_zipFile(zone, out_put, in_xml, file):
    # defining the destination path
    Product_zone = os.path.join(out_put, zone)
    print(Product_zone)
    # moving the file from the base folder to the product folder
    try:
        os.makedirs(Product_zone, exist_ok=True)
        print("Directory '%s' created successfully" % zone)
    except OSError as error:
        print("Directory '%s' Exist " % error)
    try:
        # unzipping the zip file
        shutil.unpack_archive(os.path.join(in_xml, file), os.path.join(Product_zone, os.path.splitext(file)[0]))
        os.remove(os.path.join(in_xml, file))
        print("File '%s' moved to successfully" % file)
    except OSError as error:
        print("File '%s' Exist " % error)

# function used to read the config file
def config_read():
    config = configparser.ConfigParser()
    config.read('./Config.ini')
    xml_path = config.get('Path', 'xml_path')
    dest = config.get('Path', 'dest')
    return xml_path, dest

def main():
    in_xml = config_read()[0]
    out_put = config_read()[1]
    for file in os.listdir(in_xml):
        move_zipFile(XMl_Parsa(read_zipfile(os.path.join(in_xml, file))), out_put, in_xml, file)

if __name__ == "__main__":
    while 1:
        time.sleep(10)
        main()
Error
Traceback (most recent call last):
File "Clero_zipFile_transfer - Copy.py", line 65, in <module>
File "Clero_zipFile_transfer - Copy.py", line 60, in main
File "Clero_zipFile_transfer - Copy.py", line 9, in read_zipfile
File "zipfile.py", line 1268, in __init__
File "zipfile.py", line 1335, in _RealGetContents
zipfile.BadZipFile: File is not a zip file
[2916] Failed to execute script Clero_zipFile_transfer - Copy
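One way to implement the size-stability check described in the question is to record a file's size, wait, and only process it if the size has not changed; a minimal sketch (the helper name is_copy_finished and the 30-second wait are illustrative assumptions):
import os
import time

def is_copy_finished(path, wait_seconds=30):
    # a size that stays constant across the pause suggests the copy has finished
    size_before = os.path.getsize(path)
    time.sleep(wait_seconds)
    return os.path.getsize(path) == size_before

# inside main(), skip files that are still growing:
# for file in os.listdir(in_xml):
#     full_path = os.path.join(in_xml, file)
#     if not is_copy_finished(full_path):
#         continue
#     move_zipFile(XMl_Parsa(read_zipfile(full_path)), out_put, in_xml, file)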

Python - Parsing all XML files in a folder to CSV files

I just started learning Python, so this might be a very basic question, but here's where I'm stuck.
I'm trying to parse ALL XML files in a given folder and output CSV files with the same filenames as the original XML files. I've tested with single files and it works perfectly, but the issue I'm having is doing the same for all of them and keeping it running in a loop, as it is meant to be a perpetual script.
Here's my code:
import os
import xml.etree.cElementTree as Eltree
import pandas as pd

path = r'C:/python_test'
filenames = []

for filename in os.listdir(path):
    if not filename.endswith('.xml'): continue
    fullname = os.path.join(path, filename)
    print(fullname)
    filenames.append(fullname)

cols = ["serviceName", "startDate", "endDate"]
rows = []

for filename in filenames:
    xmlparse = Eltree.parse(filename)
    root = xmlparse.getroot()
    csvoutput = []
    for fixed in root.iter('{http://www.w3.org/2001/XMLSchema}channel'):
        channel = fixed.find("channelName").text
    for dyn in root.iter('programInformation'):
        start = dyn.find("publishedStartTime").text
        end = dyn.find("endTime").text
        rows.append({"serviceName": channel, "startDate": start, "endDate": end})
    df = pd.DataFrame(rows, columns=cols)
    df.to_csv(csvoutput)
This is the error I'm getting:
C:/python_test\1.xml
C:/python_test\2.xml
C:/python_test\3.xml
C:/python_test\4.xml
C:/python_test\5.xml
C:/python_test\6.xml
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "<string>", line 49, in <module>
File "C:\Users\ragehol\AppData\Local\Programs\Python\Python39\lib\site-packages\pandas\core\generic.py", line 3466, in to_csv
return DataFrameRenderer(formatter).to_csv(
File "C:\Users\ragehol\AppData\Local\Programs\Python\Python39\lib\site-packages\pandas\io\formats\format.py", line 1105, in to_csv
csv_formatter.save()
File "C:\Users\ragehol\AppData\Local\Programs\Python\Python39\lib\site-packages\pandas\io\formats\csvs.py", line 237, in save
with get_handle(
File "C:\Users\ragehol\AppData\Local\Programs\Python\Python39\lib\site-packages\pandas\io\common.py", line 609, in get_handle
ioargs = _get_filepath_or_buffer(
File "C:\Users\ragehol\AppData\Local\Programs\Python\Python39\lib\site-packages\pandas\io\common.py", line 396, in _get_filepath_or_buffer
raise ValueError(msg)
ValueError: Invalid file path or buffer object type: <class 'list'>
Any kind of suggestions would be greatly appreciated!
Many thanks!
This is your bug:
csvoutput = [] is defined as a list. Later on you pass it as an argument to df.to_csv(csvoutput), so you are passing a list to a method that expects a file path.
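A minimal fix along those lines is to derive the CSV path from the XML filename and pass that string to to_csv instead of the list (the splitext-based naming is an assumption about where the output should go):
csvoutput = os.path.splitext(filename)[0] + '.csv'  # e.g. C:/python_test\1.csv
df = pd.DataFrame(rows, columns=cols)
df.to_csv(csvoutput, index=False)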

No text parsed from document

I have written an HTML text parser. When I use it on a large batch of files, i.e. 5,000 or more, it randomly produces this error; when I re-run it, it produces the same error in the exact same files. So I removed those files and parsed them individually, and the parser read them.
So I created a new folder with the "problematic" files and tried parsing them separately; it produced no error for most of them, then it reproduced the same error again.
This is the code
import pandas as pd
import shutil
import os
import glob
source_file = r'C:/Users/Ahmed_Abdelmuniem/Desktop/Mar/Problematic/'
file_names = glob.glob(os.path.join(source_file,"*.html"))
for file_name in file_names:
    table = pd.read_html(file_name)
    print(table)
This is the error:
Traceback (most recent call last):
File "C:\Users\Ahmed_Abdelmuniem\PycharmProjects\No Text Parsed Troubleshooting\main.py", line 11, in <module>
table = pd.read_html(file_name)
File "C:\Users\Ahmed_Abdelmuniem\AppData\Local\Programs\Python\Python39\lib\site-packages\pandas\util\_decorators.py", line 299, in wrapper
return func(*args, **kwargs)
File "C:\Users\Ahmed_Abdelmuniem\AppData\Local\Programs\Python\Python39\lib\site-packages\pandas\io\html.py", line 1085, in read_html
return _parse(
File "C:\Users\Ahmed_Abdelmuniem\AppData\Local\Programs\Python\Python39\lib\site-packages\pandas\io\html.py", line 893, in _parse
tables = p.parse_tables()
File "C:\Users\Ahmed_Abdelmuniem\AppData\Local\Programs\Python\Python39\lib\site-packages\pandas\io\html.py", line 213, in parse_tables
tables = self._parse_tables(self._build_doc(), self.match, self.attrs)
File "C:\Users\Ahmed_Abdelmuniem\AppData\Local\Programs\Python\Python39\lib\site-packages\pandas\io\html.py", line 735, in _build_doc
raise XMLSyntaxError("no text parsed from document", 0, 0, 0)
File "<string>", line 0
lxml.etree.XMLSyntaxError: no text parsed from document
Process finished with exit code 1
I took the "unreadable" files outside of the folder and parsed them individually, and the code read them; I can't seem to identify what is wrong.
I hope my explanation is clear and sufficient.
There is a hidden .DS_Store file in the folder. This is my code:
from lxml import etree
import pandas as pd
import os
from time import sleep

locations_folder = '/Users/jerryhu/Documents/Documents/Zenly/locations'
failed_files = []

def parse(file):
    tables = pd.read_html(file)
    dataframe = pd.DataFrame(tables[0])
    path, name = os.path.split(file)
    with open(f'/Users/jerryhu/Documents/Documents/Zenly/locations_csv/{name}'.replace('.html', '.csv'), 'w') as writeCSV:
        dataframe.to_csv(writeCSV)
    print(f"Writing {name.replace('.html', '.csv')} to disk")
    try:
        failed_files.remove(file)
    except:
        pass

for filename in os.listdir(locations_folder):
    file = os.path.join(locations_folder, filename)
    if os.path.exists(file):
        try:
            parse(file)
        except:
            failed_files.append(file)

print("\nFinished. These files failed to parse:")
for i in failed_files:
    print(i)

print("Retrying in 3 seconds.")
sleep(3)

for i in failed_files:
    try:
        parse(i)
    except:
        print(f'{i} couldn\'t be parsed.')
This is the error returned:
Writing 2022-10-09.csv to disk
Writing 2022-09-13.csv to disk
Writing 2022-09-05.csv to disk
Writing 2022-08-28.csv to disk
Writing 2022-12-22.csv to disk
Writing 2022-08-08.csv to disk
Writing 2023-01-01.csv to disk
Writing 2022-09-25.csv to disk
Writing 2022-12-02.csv to disk
Writing 2022-11-12.csv to disk
Writing 2022-12-14.csv to disk
Writing 2022-10-29.csv to disk
Writing 2022-11-04.csv to disk
Writing 2022-10-05.csv to disk
Writing 2022-11-28.csv to disk
Writing 2022-08-24.csv to disk
Writing 2022-07-17.csv to disk
Writing 2022-09-09.csv to disk
Writing 2022-10-13.csv to disk
Finished. These files failed to parse:
/Users/jerryhu/Documents/Documents/Zenly/locations/.DS_Store
Retrying in 3 seconds.
/Users/jerryhu/Documents/Documents/Zenly/locations/.DS_Store couldn't be parsed.
Just put a try/except block around the parse call to skip the .DS_Store file.
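Alternatively, filtering the directory listing to HTML files means hidden entries like .DS_Store are never handed to read_html; a small sketch reusing the parse function and locations_folder defined above:
import glob
import os

# only *.html entries are returned, so .DS_Store and other hidden files are skipped
for file in glob.glob(os.path.join(locations_folder, '*.html')):
    parse(file)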
