running data parser on multiple files in folder? Python

running data parser on multiple files in folder? Python - python

long time lurker, but never posted here. Sorry if this isn't a good post...I made a program that uses regex to pull the names and emails out of resumes. I can get it to open a specific file in my resume folder, but getting the program to iterate over all of the files in the folder has me stumped. Here's the pseudo-code for what I'm doing:
open resume folder
read file1.txt
execute nameFinder
execute emailFinder
create new dictionary candidateData
Export to Excel
read file2.txt
...
Here's the code:
import re
import os
import pprint
with open('John Doe -Resume.txt', 'r') as f:
#This pulls the first line of the resume,
#Which is generally the name.
first_line_name = f.readline().strip()
#This pulls the Email from the resume.
bulkemails = f.read()
r = re.compile(r'(\b[\w.]+#+[\w.]+.+[\w.]\b)')
candidateEmail = r.findall(bulkemails)
emails = ""
for x in candidateEmail:
emails += str(x)+"\n"
#This creates the dictionary data
candidateData = {'candidateEmail' : str(candidateEmail), \
'candidateName' : str(first_line_name)}
pprint.pprint(candidateData)
Then, I get this as an output:
{'candidateEmail': "['JohnDoe#gmail.com']",
'candidateName': 'John Doe'}
All ready to be exported into Excel.
SO HERE"S MY QUESTION FOR YOU! How do I get it to do this for ALL of the .txt files in my resume folder, and not just the file I specify? Also, any cod critique would be greatly appreciated, Thanks guys! :D

You can use glob to iterate over all .txt files in your directory and then run the function on each file. Add this to the start
import re
import os
import glob
import pprint
os.chdir("resumes")
for file in glob.glob("*.txt"):
with open(file, 'r') as f:
#Rest of your execution code here
EDIT: In answer to your question in the comments:
import re
import os
import glob
import pprint
candidateDataList = []
for file in glob.glob("*.txt"):
with open(file, 'r') as f:
#This pulls the first line of the resume,
#Which is generally the name.
first_line_name = f.readline().strip()
#This pulls the Email from the resume.
bulkemails = f.read()
r = re.compile(r'(\b[\w.]+#+[\w.]+.+[\w.]\b)')
candidateDataList.append({'name':str(first_line_name),
'email':r.findall(bulkemails)})
pprint.pprint(candidateDataList)

#Jakob's answer is spot on. I only wanted to mention a nice alternative which I usually prefer myself, the pathlib:
import re
import pprint
from pathlib import Path
resumes_dir = Path("resumes")
for path in resumes_dir.glob("*.txt"):
with path.open() as f:
#Rest of your execution code here

Related

python uploading a remote file to GCS , without saving it in the machine [duplicate]

I have managed to get my first python script to work which downloads a list of .ZIP files from a URL and then proceeds to extract the ZIP files and writes them to disk.
I am now at a loss to achieve the next step.
My primary goal is to download and extract the zip file and pass the contents (CSV data) via a TCP stream. I would prefer not to actually write any of the zip or extracted files to disk if I could get away with it.
Here is my current script which works but unfortunately has to write the files to disk.
import urllib, urllister
import zipfile
import urllib2
import os
import time
import pickle
# check for extraction directories existence
if not os.path.isdir('downloaded'):
os.makedirs('downloaded')
if not os.path.isdir('extracted'):
os.makedirs('extracted')
# open logfile for downloaded data and save to local variable
if os.path.isfile('downloaded.pickle'):
downloadedLog = pickle.load(open('downloaded.pickle'))
else:
downloadedLog = {'key':'value'}
# remove entries older than 5 days (to maintain speed)
# path of zip files
zipFileURL = "http://www.thewebserver.com/that/contains/a/directory/of/zip/files"
# retrieve list of URLs from the webservers
usock = urllib.urlopen(zipFileURL)
parser = urllister.URLLister()
parser.feed(usock.read())
usock.close()
parser.close()
# only parse urls
for url in parser.urls:
if "PUBLIC_P5MIN" in url:
# download the file
downloadURL = zipFileURL + url
outputFilename = "downloaded/" + url
# check if file already exists on disk
if url in downloadedLog or os.path.isfile(outputFilename):
print "Skipping " + downloadURL
continue
print "Downloading ",downloadURL
response = urllib2.urlopen(downloadURL)
zippedData = response.read()
# save data to disk
print "Saving to ",outputFilename
output = open(outputFilename,'wb')
output.write(zippedData)
output.close()
# extract the data
zfobj = zipfile.ZipFile(outputFilename)
for name in zfobj.namelist():
uncompressed = zfobj.read(name)
# save uncompressed data to disk
outputFilename = "extracted/" + name
print "Saving extracted file to ",outputFilename
output = open(outputFilename,'wb')
output.write(uncompressed)
output.close()
# send data via tcp stream
# file successfully downloaded and extracted store into local log and filesystem log
downloadedLog[url] = time.time();
pickle.dump(downloadedLog, open('downloaded.pickle', "wb" ))

Below is a code snippet I used to fetch zipped csv file, please have a look:
Python 2:
from StringIO import StringIO
from zipfile import ZipFile
from urllib import urlopen
resp = urlopen("http://www.test.com/file.zip")
myzip = ZipFile(StringIO(resp.read()))
for line in myzip.open(file).readlines():
print line
Python 3:
from io import BytesIO
from zipfile import ZipFile
from urllib.request import urlopen
# or: requests.get(url).content
resp = urlopen("http://www.test.com/file.zip")
myzip = ZipFile(BytesIO(resp.read()))
for line in myzip.open(file).readlines():
print(line.decode('utf-8'))
Here file is a string. To get the actual string that you want to pass, you can use zipfile.namelist(). For instance,
resp = urlopen('http://mlg.ucd.ie/files/datasets/bbc.zip')
myzip = ZipFile(BytesIO(resp.read()))
myzip.namelist()
# ['bbc.classes', 'bbc.docs', 'bbc.mtx', 'bbc.terms']

My suggestion would be to use a StringIO object. They emulate files, but reside in memory. So you could do something like this:
# get_zip_data() gets a zip archive containing 'foo.txt', reading 'hey, foo'
import zipfile
from StringIO import StringIO
zipdata = StringIO()
zipdata.write(get_zip_data())
myzipfile = zipfile.ZipFile(zipdata)
foofile = myzipfile.open('foo.txt')
print foofile.read()
# output: "hey, foo"
Or more simply (apologies to Vishal):
myzipfile = zipfile.ZipFile(StringIO(get_zip_data()))
for name in myzipfile.namelist():
[ ... ]
In Python 3 use BytesIO instead of StringIO:
import zipfile
from io import BytesIO
filebytes = BytesIO(get_zip_data())
myzipfile = zipfile.ZipFile(filebytes)
for name in myzipfile.namelist():
[ ... ]

I'd like to offer an updated Python 3 version of Vishal's excellent answer, which was using Python 2, along with some explanation of the adaptations / changes, which may have been already mentioned.
from io import BytesIO
from zipfile import ZipFile
import urllib.request
url = urllib.request.urlopen("http://www.unece.org/fileadmin/DAM/cefact/locode/loc162txt.zip")
with ZipFile(BytesIO(url.read())) as my_zip_file:
for contained_file in my_zip_file.namelist():
# with open(("unzipped_and_read_" + contained_file + ".file"), "wb") as output:
for line in my_zip_file.open(contained_file).readlines():
print(line)
# output.write(line)
Necessary changes:
There's no StringIO module in Python 3 (it's been moved to io.StringIO). Instead, I use io.BytesIO]2, because we will be handling a bytestream -- Docs, also this thread.
urlopen:
"The legacy urllib.urlopen function from Python 2.6 and earlier has been discontinued; urllib.request.urlopen() corresponds to the old urllib2.urlopen.", Docs and this thread.
Note:
In Python 3, the printed output lines will look like so: b'some text'. This is expected, as they aren't strings - remember, we're reading a bytestream. Have a look at Dan04's excellent answer.
A few minor changes I made:
I use with ... as instead of zipfile = ... according to the Docs.
The script now uses .namelist() to cycle through all the files in the zip and print their contents.
I moved the creation of the ZipFile object into the with statement, although I'm not sure if that's better.
I added (and commented out) an option to write the bytestream to file (per file in the zip), in response to NumenorForLife's comment; it adds "unzipped_and_read_" to the beginning of the filename and a ".file" extension (I prefer not to use ".txt" for files with bytestrings). The indenting of the code will, of course, need to be adjusted if you want to use it.
Need to be careful here -- because we have a byte string, we use binary mode, so "wb"; I have a feeling that writing binary opens a can of worms anyway...
I am using an example file, the UN/LOCODE text archive:
What I didn't do:
NumenorForLife asked about saving the zip to disk. I'm not sure what he meant by it -- downloading the zip file? That's a different task; see Oleh Prypin's excellent answer.
Here's a way:
import urllib.request
import shutil
with urllib.request.urlopen("http://www.unece.org/fileadmin/DAM/cefact/locode/2015-2_UNLOCODE_SecretariatNotes.pdf") as response, open("downloaded_file.pdf", 'w') as out_file:
shutil.copyfileobj(response, out_file)

I'd like to add my Python3 answer for completeness:
from io import BytesIO
from zipfile import ZipFile
import requests
def get_zip(file_url):
url = requests.get(file_url)
zipfile = ZipFile(BytesIO(url.content))
files = [zipfile.open(file_name) for file_name in zipfile.namelist()]
return files.pop() if len(files) == 1 else files

write to a temporary file which resides in RAM
it turns out the tempfile module ( http://docs.python.org/library/tempfile.html ) has just the thing:
tempfile.SpooledTemporaryFile([max_size=0[,
mode='w+b'[, bufsize=-1[, suffix=''[,
prefix='tmp'[, dir=None]]]]]])
This
function operates exactly as
TemporaryFile() does, except that data
is spooled in memory until the file
size exceeds max_size, or until the
file’s fileno() method is called, at
which point the contents are written
to disk and operation proceeds as with
TemporaryFile().
The resulting file has one additional
method, rollover(), which causes the
file to roll over to an on-disk file
regardless of its size.
The returned object is a file-like
object whose _file attribute is either
a StringIO object or a true file
object, depending on whether
rollover() has been called. This
file-like object can be used in a with
statement, just like a normal file.
New in version 2.6.
or if you're lazy and you have a tmpfs-mounted /tmp on Linux, you can just make a file there, but you have to delete it yourself and deal with naming

Adding on to the other answers using requests:
# download from web
import requests
url = 'http://mlg.ucd.ie/files/datasets/bbc.zip'
content = requests.get(url)
# unzip the content
from io import BytesIO
from zipfile import ZipFile
f = ZipFile(BytesIO(content.content))
print(f.namelist())
# outputs ['bbc.classes', 'bbc.docs', 'bbc.mtx', 'bbc.terms']
Use help(f) to get more functions details for e.g. extractall() which extracts the contents in zip file which later can be used with with open.

All of these answers appear too bulky and long. Use requests to shorten the code, e.g.:
import requests, zipfile, io
r = requests.get(zip_file_url)
z = zipfile.ZipFile(io.BytesIO(r.content))
z.extractall("/path/to/directory")

Vishal's example, however great, confuses when it comes to the file name, and I do not see the merit of redefing 'zipfile'.
Here is my example that downloads a zip that contains some files, one of which is a csv file that I subsequently read into a pandas DataFrame:
from StringIO import StringIO
from zipfile import ZipFile
from urllib import urlopen
import pandas
url = urlopen("https://www.federalreserve.gov/apps/mdrm/pdf/MDRM.zip")
zf = ZipFile(StringIO(url.read()))
for item in zf.namelist():
print("File in zip: "+ item)
# find the first matching csv file in the zip:
match = [s for s in zf.namelist() if ".csv" in s][0]
# the first line of the file contains a string - that line shall de ignored, hence skiprows
df = pandas.read_csv(zf.open(match), low_memory=False, skiprows=[0])
(Note, I use Python 2.7.13)
This is the exact solution that worked for me. I just tweaked it a little bit for Python 3 version by removing StringIO and adding IO library
Python 3 Version
from io import BytesIO
from zipfile import ZipFile
import pandas
import requests
url = "https://www.nseindia.com/content/indices/mcwb_jun19.zip"
content = requests.get(url)
zf = ZipFile(BytesIO(content.content))
for item in zf.namelist():
print("File in zip: "+ item)
# find the first matching csv file in the zip:
match = [s for s in zf.namelist() if ".csv" in s][0]
# the first line of the file contains a string - that line shall de ignored, hence skiprows
df = pandas.read_csv(zf.open(match), low_memory=False, skiprows=[0])

It wasn't obvious in Vishal's answer what the file name was supposed to be in cases where there is no file on disk. I've modified his answer to work without modification for most needs.
from StringIO import StringIO
from zipfile import ZipFile
from urllib import urlopen
def unzip_string(zipped_string):
unzipped_string = ''
zipfile = ZipFile(StringIO(zipped_string))
for name in zipfile.namelist():
unzipped_string += zipfile.open(name).read()
return unzipped_string

Use the zipfile module. To extract a file from a URL, you'll need to wrap the result of a urlopen call in a BytesIO object. This is because the result of a web request returned by urlopen doesn't support seeking:
from urllib.request import urlopen
from io import BytesIO
from zipfile import ZipFile
zip_url = 'http://example.com/my_file.zip'
with urlopen(zip_url) as f:
with BytesIO(f.read()) as b, ZipFile(b) as myzipfile:
foofile = myzipfile.open('foo.txt')
print(foofile.read())
If you already have the file downloaded locally, you don't need BytesIO, just open it in binary mode and pass to ZipFile directly:
from zipfile import ZipFile
zip_filename = 'my_file.zip'
with open(zip_filename, 'rb') as f:
with ZipFile(f) as myzipfile:
foofile = myzipfile.open('foo.txt')
print(foofile.read().decode('utf-8'))
Again, note that you have to open the file in binary ('rb') mode, not as text or you'll get a zipfile.BadZipFile: File is not a zip file error.
It's good practice to use all these things as context managers with the with statement, so that they'll be closed properly.

`open` does not create the file

I am working on a Tkinter app.
I want to add premium features to it, I will sell an exe file that will generate a premium code.
here is the script:
import requests
import os
import random
from pathlib import Path
url = 'a website with all of the premium codes in a txt file'
r = requests.get(url)
code = list(map(str, r.text.split()))
appdata = os.getenv("APPDATA")
data = Path(f"{appdata}\\premiumaccess.txt")
if not data.is_file():
open(f"{appdata}premiumaccess.txt" ,'a')
with open(f"{appdata}\\premiumaccess.txt", 'r') as file:
if '1' in file:
print("You already claimed your premium code...")
else:
print(f'{random.choice(code)}\n\nThis is your AccTools premium code! do not share it with anyone, have fun!')
with open(f"{appdata}\\premiumaccess.txt", 'w') as file:
file.truncate()
file.write("1")
The code gives me the premium code, but does not create any file called premiumaccess and writes data in it.
Please help me.
(again, sorry if my English is bad, this is not my main language, ty and sorry)

I think the problem is here:
if not data.is_file():
open(f"{appdata}premiumaccess.txt" ,'a')
Missing the backash
I suggest use a variable for the path with os.path.join:
file_path = os.path.join(os.getenv("APPDATA"), "premiumaccess.txt")

Use the context manager for manipulating your text files. That way your files are properly closed. 👍🏿

Issue writting to file with pyinstaller

So an update, I found my compile issue was that I needed to change my notebook to a py file and choosing save as doesn't do that. So I had to run a different script turn my notebook to a py file. And part of my exe issue was I was using the fopen command that apparently isn't useable when compiled into a exe. So I redid the code to what is above. But now I get a write error when trying to run the script. I can not find anything on write functions with os is there somewhere else I should look?
Original code:
import requests
import json
import pandas as pd
import csv
from pathlib import Path
response = requests.get('url', headers={'CERT': 'cert'}, stream=True).json()
json2 = json.dumps(response)
f = open('data.json', 'r+')
f.write(json2)
f.close()
Path altered code:
import requests
import json
import pandas as pd
import csv
from pathlib import Path
response = requests.get('url', headers={'CERT': 'cert'}, stream=True).json()
json2 = json.dumps(response)
filename = 'data.json'
if '_MEIPASS2' in os.environ:
filename = os.path.join(os.environ['_MEIPASS2'], filename)
fd = open(filename, 'r+')
fd.write(json2)
fd.close()
The changes to the code allowed me to get past the fopen issue but created a write issue. Any ideas?

If you want to write to a file, you have to open it as writable.
fd = open(filename, 'wb')
Although I don't know why you're opening it in binary if you're writing text.

Why can't I create a csv file with the csv module?

I am trying to write a list to a csv file.
the following code runs and returns no error, but it doesnt work, in that it doesnt actually populate the csv file with the stuff in the list. I am probably doing it wrong because I don't understand something.
import newspaper
import os
from newspaper import article
libya_newspaperlist = []
libya_newspaper=newspaper.build('https://www.cnn.com', memoize_article=False)
for article in libya_newspaper.articles:
libya_newspaperlist.append(article.url)
import csv
os.chdir("/users/patrickharned/")
libya_newspaper.csv = "/users/patrickharned/libya_newspaper.csv"
def write_list_to_file(libya_newspaperlist):
"""Write the list to csv file."""
with open("/users/patrickharned/libya_newspaper.csv") as outfile:
outfile.write(libya_newspaperlist)
So I changed the code to this.
import newspaper
import os
from newspaper import article
libya_newspaperlist = []
libya_newspaper=newspaper.build('https://www.cnn.com', memoize_article=False)
for article in libya_newspaper.articles:
libya_newspaperlist.append(article.url)
import csv
os.chdir("/users/patrickharned/")
libya_newspaper.csv = "/users/patrickharned/libya_newspaper.csv"
with open("/users/patrickharned/libya_newspaper.csv", "w") as outfile:
outfile.write(str(libya_newspaperlist))
now it does output to the csv file, but it only outputs the first entry and wont do the rest. any suggestions?

You have to open the file in write mode:
with open("/users/patrickharned/libya_newspaper.csv", "w") as outfile:
outfile.write(libya_newspaperlist)

Combining multiple CSV files into 1

I have a python code that takes multiple text files as input and generates output in separate CSV file so if my text files are ABC.txt and XYX.txt then my code is generating output in 2 CSV files ABC.csv and XYX.csv. My ultimate goal is get one single CSV file with all the outputs. Since I am more comfortable with sql I was thinking about uploading all the files to a database and then combine them using sql but I was wondering if I can modify my python code below to generate one single CSV file containing all output. Here is my code:
import json
from watson_developer_cloud import ToneAnalyzerV3Beta
import urllib.request
import codecs
import csv
import os
import re
import sys
import collections
import glob
import xlwt
from bs4 import BeautifulSoup
ipath = 'C:/TEMP/' # input folder
opath = 'C:/TEMP/' # output folder
reader = codecs.getreader("utf-8")
tone_analyzer = ToneAnalyzerV3Beta(
url='https://gateway.watsonplatform.net/tone-analyzer/api',
username='1f2fd51b-d0fb-45d8-aba2-08e22777b77d',
password='DykYfXjV4UXP',
version='2016-02-11')
path = 'C:/TEMP/*.html'
file = glob.glob(path)
# iterate over the list getting each file
writer = csv.writer(open('C:/TEMP/test', mode='w'))
# now enter our input loop
for fle in file:
# open the file and then call .read() to get the text
with open(fle) as f:
...
# output tone name and score to file
for i in tonename:
writer.writerows((tone['tone_name'],tone['score']) for tone in cat['tones'])

Modifying your existing code as little as possible ... you simply need to open the csv file before entering your loop that reads the text files:
...
path = 'C:/TEMP/*.html'
file = glob.glob(path)
# !! open our output csv
writer = csv.writer(open('our-merged-data', mode='w'))
# iterate over the list getting each file
for fle in file:
# open the file and then call .read() to get the text
with open(fle) as f:
...
# output tone name and score to file
for i in tonename:
writer.writerows((tone['tone_name'],tone['score'],Date,Title) for tone in cat['tones'])

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

running data parser on multiple files in folder? Python - python

#Jakob's answer is spot on. I only wanted to mention a nice alternative which I usually prefer myself, the pathlib: import re import pprint from pathlib import Path resumes_dir = Path("resumes") for path in resumes_dir.glob("*.txt"): with path.open() as f: #Rest of your execution code here

Related

python uploading a remote file to GCS , without saving it in the machine [duplicate]

`open` does not create the file

Issue writting to file with pyinstaller

Why can't I create a csv file with the csv module?

Combining multiple CSV files into 1

Categories

Resources