I am trying to iterate through PDFs to extract information from emails. My individual regex statements work when I try them on individual examples; however, when I put all the code together in a for loop to iterate over multiple PDFs at once, I am unable to append to my aggregate df (currently I just end up with an empty df). I need the try/except because not all emails have all fields (e.g. some do not have the 'Attachments' field). Below is the code I have written so far:
import os
import re
import pandas as pd
pd.options.display.max_rows = 999
import numpy
from numpy import NaN
from tika import parser
root = r"my_dir"
agg_df = pd.DataFrame()
for directory, subdirectory, files in os.walk(root):
    for file in files:
        filepath = os.path.join(directory, file)
        print(file)
        raw = parser.from_file(filepath)
        img = raw['content']
        img = img.replace('\n', '')
        try:
            from_field = re.search(r'From:(.*?)Sent:', img).group(1)
        except:
            pass
        try:
            sent_field = re.search(r'Sent:(.*?)To:', img).group(1)
        except:
            pass
        try:
            to_field = re.search(r'To:(.*?)Cc:', img).group(1)
        except:
            pass
        try:
            cc_field = re.search(r'Cc:(.*?)Subject:', img).group(1)
        except:
            pass
        try:
            subject_field = re.search(r'Subject:(.*?)Attachments:', img).group(1)
        except:
            pass
        try:
            attachments_field = re.search(r'Attachments:(.*?)NOTICE', img).group(1)
        except:
            pass
        img_df = pd.DataFrame(columns=['From', 'Sent', 'To',
                                       'Cc', 'Subject', 'Attachments'])
        img_df['From'] = from_field
        img_df['Sent'] = sent_field
        img_df['To'] = to_field
        img_df['Cc'] = cc_field
        img_df['Subject'] = subject_field
        img_df['Attachments'] = attachments_field
        agg_df = agg_df.append(img_df)
There are two things:
1. When you don't get a match, you shouldn't just pass on the exception; use a default value instead.
2. Don't append to your dataframe each time through the loop; that is slow. Keep everything in a dictionary, and then construct the dataframe at the end.
E.g.
from collections import defaultdict

data = defaultdict(list)
for directory, _, files in os.walk(root):
    for file in files:
        filepath = os.path.join(directory, file)
        print(file)
        raw = parser.from_file(filepath)
        img = raw['content']
        img = img.replace('\n', '')

        from_match = re.search(r'From:(.*?)Sent:', img)
        if not from_match:
            sent_by = None
        else:
            sent_by = from_match.group(1)
        data["from"].append(sent_by)

        to_match = re.search(r'To:(.*?)Cc:', img)
        if not to_match:
            sent_to = None
        else:
            sent_to = to_match.group(1)
        data["to"].append(sent_to)

        # All your other regexes

df = pd.DataFrame(data)
Also, if you're doing this for a lot of files you should look into using compiled regular expressions.
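For example, a sketch of the same idea (the dictionary keys here are illustrative): compile the patterns once at module level, and the six per-field blocks inside the loop collapse into a single lookup.
import re

# Compiled once, reused for every file.
FIELD_PATTERNS = {
    "from": re.compile(r'From:(.*?)Sent:'),
    "sent": re.compile(r'Sent:(.*?)To:'),
    "to": re.compile(r'To:(.*?)Cc:'),
    "cc": re.compile(r'Cc:(.*?)Subject:'),
    "subject": re.compile(r'Subject:(.*?)Attachments:'),
    "attachments": re.compile(r'Attachments:(.*?)NOTICE'),
}

# Inside the per-file loop, in place of the per-field if/else blocks:
for field, pattern in FIELD_PATTERNS.items():
    match = pattern.search(img)
    # None is the default when a field is missing, as suggested above.
    data[field].append(match.group(1) if match else None)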
I have a script that collects Reddit comments. It pulls from a csv file with a list of links in it. Some of the links are dead, and I get 404/403/etc. errors. The code below correctly identifies them and skips, but it then exits the loop and completes the process of making the csv file without continuing on to the next link.
import praw
import pprint
import csv
import os
import pandas as pd
from collections import namedtuple
from datetime import datetime
from pathlib import Path

def scrape_comments(reddit_api, csv_file, dest):
    df = pd.read_csv(csv_file)
    data = []
    try:
        for pid in df.id:
            # post_comment = []
            submission = reddit_api.submission(id=pid)
            submission.comments.replace_more(limit=None)
            for comment in submission.comments.list():
                # post_comment.append(comment.body)
                data.append((pid, comment.id, comment.parent_id, comment.body, comment.link_id, comment.author, comment.score, comment.created_utc, comment.subreddit))
            # data.append((pid, ";".join(post_comment)))
    except:
        print("Error! Skip the Current subreddit")
    df = pd.DataFrame(data, columns=["post_id", "comment_id", "comment_parent_id", "comment_body", "comment_link_id", "comment_author", "comment_score", "comment_created", "comment_subreddit"])  # append tuple
    df.to_csv(dest, index=False, encoding='utf-8')

if __name__ == "__main__":
    reddit_api = praw.Reddit(
        client_id="####",
        client_secret="####",
        user_agent="####",
        username="####",
        password="####"
    )
    # reddit_api = init_praw(client_id, client_secret, user_agent, username, password)
    csv_file = "####"
    dest_dir = "####"
    dest_name = "reddits_comments.csv"
    Path(dest_dir).mkdir(parents=True, exist_ok=True)
    dest = os.path.join(dest_dir, dest_name)
    scrape_comments(reddit_api, csv_file, dest)
You should put the try/except around a smaller portion of your code, as mentioned in the comments. Here's an illustration of that:
def scrape_comments(reddit_api, csv_file, dest):
    df = pd.read_csv(csv_file)
    data = []
    for pid in df.id:
        try:
            # post_comment = []
            submission = reddit_api.submission(id=pid)
            submission.comments.replace_more(limit=None)
            for comment in submission.comments.list():
                # post_comment.append(comment.body)
                data.append((pid, comment.id, comment.parent_id, comment.body, comment.link_id, comment.author, comment.score, comment.created_utc, comment.subreddit))
            # data.append((pid, ";".join(post_comment)))
        except Exception:
            print("Error! Skip the Current subreddit")
    df = pd.DataFrame(data, columns=["post_id", "comment_id", "comment_parent_id", "comment_body", "comment_link_id", "comment_author", "comment_score", "comment_created", "comment_subreddit"])
    df.to_csv(dest, index=False, encoding='utf-8')
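As a refinement (my addition, not part of the original answer): since the dead links raise HTTP errors, you could catch those specifically instead of a blanket Exception. A minimal sketch, assuming the prawcore package that PRAW is built on:
import prawcore

for pid in df.id:
    try:
        submission = reddit_api.submission(id=pid)
        submission.comments.replace_more(limit=None)
        comments = submission.comments.list()
    except prawcore.exceptions.ResponseException as exc:
        # Base class for HTTP errors such as 404 (NotFound) and 403 (Forbidden).
        print("Skipping {}: {}".format(pid, exc))
        continue
    for comment in comments:
        data.append((pid, comment.id, comment.parent_id, comment.body))  # plus the other fields as above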
I want to create and then repeatedly append to a csv file using DataLakeServiceClient (from the azure.storage.filedatalake package). The initial create/write works as follows.
from azure.storage.filedatalake import DataLakeServiceClient

datalake_service_client = DataLakeServiceClient.from_connection_string(connect_str)

myfilesystem = "ContainerName"
myfolder = "FolderName"
myfile = "FileName.csv"

file_system_client = datalake_service_client.get_file_system_client(myfilesystem)
try:
    directory_client = file_system_client.create_directory(myfolder)
except Exception as e:
    directory_client = file_system_client.get_directory_client(myfolder)

file_client = directory_client.create_file(myfile)
data = """Test1"""
file_client.append_data(data, offset=0, length=len(data))
file_client.flush_data(len(data))
Suppose the next append is for data = """Test2""": how do I set the offset and flush_data?
Thanks.
First, you are using directory_client.create_file(myfile), which creates a new file every time, so your code will never append any content.
Second, you need to check whether the file already exists: if it does, use the get_file_client method; if it does not, use the create_file method. The full code looks like this (on my side, I used a .txt file to test):
from azure.storage.filedatalake import DataLakeServiceClient

connect_str = "DefaultEndpointsProtocol=https;AccountName=0730bowmanwindow;AccountKey=xxxxxx;EndpointSuffix=core.windows.net"

datalake_service_client = DataLakeServiceClient.from_connection_string(connect_str)
myfilesystem = "test"
myfolder = "test"
myfile = "FileName.txt"

file_system_client = datalake_service_client.get_file_system_client(myfilesystem)
directory_client = file_system_client.create_directory(myfolder)
directory_client = file_system_client.get_directory_client(myfolder)

try:
    file_client = directory_client.get_file_client(myfile)
    # Probe the file: this raises if it does not exist yet,
    # which drops us into the except branch below.
    file_client.get_file_properties().size
    data = "Test2"
    print("length of data is " + str(len(data)))
    filesize_previous = file_client.get_file_properties().size
    print("length of current file is " + str(filesize_previous))
    # Append after the existing content, then flush the new total length.
    file_client.append_data(data, offset=filesize_previous, length=len(data))
    file_client.flush_data(filesize_previous + len(data))
except Exception:
    # The file does not exist yet: create it and write from offset 0.
    file_client = directory_client.create_file(myfile)
    data = "Test2"
    print("length of data is " + str(len(data)))
    filesize_previous = 0
    print("length of current file is " + str(filesize_previous))
    file_client.append_data(data, offset=filesize_previous, length=len(data))
    file_client.flush_data(filesize_previous + len(data))
This works on my side; please give it a try. (The above is just an example; you can streamline the design further.)
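As an alternative to the try/except probe, newer versions of the azure-storage-file-datalake SDK expose an exists() method on path clients (treat this as an assumption and check your installed version). A minimal sketch:
file_client = directory_client.get_file_client(myfile)
data = "Test2"

if file_client.exists():
    # Append after the current end of the file.
    offset = file_client.get_file_properties().size
else:
    file_client = directory_client.create_file(myfile)
    offset = 0

file_client.append_data(data, offset=offset, length=len(data))
file_client.flush_data(offset + len(data))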
I'm trying to create multiple good/bad files from the original .csv files in a directory.
I'm fairly new to Python, but have cobbled together the code below. It's not saving multiple files, just one "good" and one "bad" file. In the directory I have testfile1 and testfile2; the output should be testfile1good, testfile1bad, testfile2good and testfile2bad.
Any help would be greatly appreciated.
Thanks
import pandas as pd
from string import ascii_letters
import glob
from pathlib import Path
files = glob.glob('C:\\Users\\nickn\\OneDrive\\Documents\\Well\\*.csv')
for f in files:
    filename = []
    filename = Path(f)
#Can not be null fields
df = pd.read_csv(f)
emptyvals = []
emptyvals = df['First Name'].isnull() | df['Last Name'].isnull()
#Bank Account Number is not 8 digits long
accountnolen = []
ac = []
accountnolen = df['AccNumLen'] = df['Bank Account Number'].astype(str).map(len)
ac = df[(df['AccNumLen'] != 8)]
acd = ac.drop(['AccNumLen'], axis=1)
#Create Exclusions
allexclusions = []
allexclusions = df[emptyvals].append(acd)
allexclusions.to_csv(filename.stem + "bad.csv", header=True, index=False)
#GoodList
#for f in files:
#    filename = []
#    filename = Path(f)
origlist = df
df = pd.merge(origlist, allexclusions, how='outer', indicator=True)
cl = df[(df['_merge'] == 'left_only')]
cld = cl.drop(['_merge', 'AccNumLen'], axis=1)
cld['Well ID'] = cld['Well ID'].str.rstrip(ascii_letters)
cld.to_csv(filename.stem + 'good.csv', header=True, index=False)
I think you run the loop but leave it right away, and do the rest (from around line 14 on) in a single pass: there filename is still set from the last file, so you save your data only once.
What you want is to run the loop so that the rest happens for each iteration, so the code should look like this:
import pandas as pd
from string import ascii_letters
import glob
from pathlib import Path

files = glob.glob('C:\\Users\\nickn\\OneDrive\\Documents\\Well\\*.csv')
for f in files:
    filename = Path(f)
    #EDIT: we stay in the loop and process each file one by one with the following lines:
    #Can not be null fields
    df = pd.read_csv(f)
    emptyvals = df['First Name'].isnull() | df['Last Name'].isnull()
    #Bank Account Number is not 8 digits long
    accountnolen = df['AccNumLen'] = df['Bank Account Number'].astype(str).map(len)
    ac = df[(df['AccNumLen'] != 8)]
    acd = ac.drop(['AccNumLen'], axis=1)
    #Create Exclusions
    allexclusions = df[emptyvals].append(acd)
    allexclusions.to_csv(filename.stem + "bad.csv", header=True, index=False)
    #GoodList
    origlist = df
    df = pd.merge(origlist, allexclusions, how='outer', indicator=True)
    cl = df[(df['_merge'] == 'left_only')]
    cld = cl.drop(['_merge', 'AccNumLen'], axis=1)
    cld['Well ID'] = cld['Well ID'].str.rstrip(ascii_letters)
    cld.to_csv(filename.stem + 'good.csv', header=True, index=False)
In other words: you iterate over the file names found in the directory, and THEN take only the last filename and process it in one pass. By indenting the rest of the code by four spaces we tell the Python interpreter that that part is inside the loop and should be executed for each file. Hope it makes sense.
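One caveat to add (an observation about newer pandas, not part of the original answer): DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so on a recent install the exclusions line needs pd.concat instead:
# Equivalent to df[emptyvals].append(acd) on pandas 2.x:
allexclusions = pd.concat([df[emptyvals], acd])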
I am trying to merge all the values into one list when I run my for loop. However, I keep getting two separate brackets in one list.
For example, when I run this code:
import glob
import re
import PyPDF2

folder_path = '/Users/my_path/cb_files'
file_pattern = "/*"
folder_contents = glob.glob(folder_path + file_pattern, recursive=True)

#IP Bank
ip = re.compile(r"((?:^|\b)(?:h[tTxX]ps?://)?(?:\d{1,3}\[?\.\]?){3}\d{1,3}(?:\b|$))")
hash_ = re.compile(r"((?:^|\b)[a-fA-F0-9]{32,64}(?:\b|$))")
domain = re.compile(r"((?:^|\b)(?:h[xXtT]{2}ps?:|meows?:)?//(?:[a-zA-Z0-9\u00A0-\uD7FF\uF900-\uFDFC\uFDF0-\uFFEF_.\[\]-]+)(?:\[?\.\]?[a-z]+)+(?::\d+)?(?:[/?][^\s]*)?)")

ip_list = []
for file in folder_contents:
    if re.search(r".*(?=pdf$)", file):
        # this is a pdf
        pdfFileObj = open('pdf.pdf', 'rb')
        pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
        pageObj = pdfReader.getPage(0)
        read_file = pageObj.extractText()
    elif '.' not in file:
        continue
    else:
        read_file = open(file, 'rt', encoding="latin-1").read()
    if ip.findall(read_file) or hash_.findall(read_file) or domain.findall(read_file):
        ips = ip.findall(read_file)
        hashs = hash_.findall(read_file)
        domains = domain.findall(read_file)
        # print("IPS", ', '.join(ips))
        ip_list.append(ips)

print(ip_list)
Here is my output:
[['000.000.0.1', '111.111.1.1'], ['222.222.2.2','333.333.3.3']]
So it looks like, for each file it loops over, it puts the matches into their own list.
I want the output to look like this:
['000.000.0.1', '111.111.1.1','222.222.2.2','333.333.3.3']
Any changes in my code that will produce these results?
Try changing:
ip_list.append(ips)
to
ip_list.extend(ips)
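The difference is that list.append adds its argument as a single element, so each file's list of matches becomes one nested item, while list.extend adds the argument's elements one by one. A quick illustration:
ip_list = []
ip_list.append(['000.000.0.1', '111.111.1.1'])
print(ip_list)   # [['000.000.0.1', '111.111.1.1']]

ip_list = []
ip_list.extend(['000.000.0.1', '111.111.1.1'])
print(ip_list)   # ['000.000.0.1', '111.111.1.1']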
I've been trying to get this to work, but I feel like I'm missing something. There is a large collection of images in a folder, and I need to rename just part of each filename. For example, I'm trying to rename "RJ_200", "RJ_600", and "RJ_601" all to the same "RJ_500", while keeping the rest of the filename intact.
Image01.Food.RJ_200.jpg
Image02.Food.RJ_200.jpg
Image03.Basket.RJ_600.jpg
Image04.Basket.RJ_600.jpg
Image05.Cup.RJ_601.jpg
Image06.Cup.RJ_602.jpg
This is what I have so far, but it keeps just giving me the "else" instead of actually renaming any of them:
import os
import fnmatch
import sys

user_profile = os.environ['USERPROFILE']
dir = user_profile + "\Desktop" + "\Working"

print(os.listdir(dir))

for images in dir:
    if images.endswith("RJ_***.jpg"):
        os.rename("RJ_***.jpg", "RJ_500.jpg")
    else:
        print("Arg!")
The Python string method endswith does not do pattern-matching with *, so you're looking for filenames that explicitly include the asterisk character and not finding any.
Try using regular expressions to match your filenames, and then build each target filename explicitly:
import os
import re

patt = r'RJ_\d\d\d'

user_profile = os.environ['USERPROFILE']
path = os.path.join(user_profile, "Desktop", "Working")
image_files = os.listdir(path)

for filename in image_files:
    flds = filename.split('.')
    try:
        frag = flds[2]
    except IndexError:
        # Not enough dot-separated parts; skip this file.
        continue
    if re.match(patt, frag):
        from_name = os.path.join(path, filename)
        to_name = '.'.join([flds[0], flds[1], 'RJ_500', 'jpg'])
        os.rename(from_name, os.path.join(path, to_name))
Note that you need to do your matching with the file's basename and join on the rest of the path later.
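One small caveat (my addition): re.match only anchors at the start, so r'RJ_\d\d\d' would also accept a longer fragment such as 'RJ_6001'. Anchoring the end, or using re.fullmatch, makes the match exact:
patt = r'RJ_\d\d\d$'               # anchored at both ends via re.match + $
# or: re.fullmatch(r'RJ_\d{3}', frag)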
You don't need to use .endswith. You can split the image file name up using .split and check the results. Since there are several suffix strings involved, I've put them all into a set for fast membership testing.
import os

suffixes = {"RJ_200", "RJ_600", "RJ_601"}
new_suffix = "RJ_500"

user_profile = os.environ["USERPROFILE"]
dir = os.path.join(user_profile, "Desktop", "Working")

for image_name in os.listdir(dir):
    pieces = image_name.split(".")
    # Guard against names without four dot-separated parts.
    if len(pieces) == 4 and pieces[2] in suffixes:
        from_path = os.path.join(dir, image_name)
        new_name = ".".join([pieces[0], pieces[1], new_suffix, pieces[3]])
        to_path = os.path.join(dir, new_name)
        print("renaming {} to {}".format(from_path, to_path))
        os.rename(from_path, to_path)
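As a design note (my addition): the print before os.rename doubles as a dry run; comment out the rename, check the printed pairs, then re-enable it. The same rename is also compact with pathlib:
from pathlib import Path

for p in Path(dir).iterdir():
    pieces = p.name.split(".")
    if len(pieces) == 4 and pieces[2] in suffixes:
        # with_name swaps just the filename, keeping the directory.
        p.rename(p.with_name(".".join([pieces[0], pieces[1], new_suffix, pieces[3]])))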