I have a script that collects Reddit comments. It pulls from a CSV file that contains a list of links. Some of the links are dead and I get 404/403/etc. errors. The code below correctly identifies them and skips them, but it then exits the loop and finishes writing the CSV file without continuing on to the next link.
import praw
import pprint
import csv
import os
import pandas as pd
from collections import namedtuple
from datetime import datetime
from pathlib import Path
def scrape_comments(reddit_api, csv_file, dest):
    df = pd.read_csv(csv_file)
    data = []
    try:
        for pid in df.id:
            # post_comment = []
            submission = reddit_api.submission(id=pid)
            submission.comments.replace_more(limit=None)
            for comment in submission.comments.list():
                # post_comment.append(comment.body)
                data.append((pid, comment.id, comment.parent_id, comment.body, comment.link_id, comment.author, comment.score, comment.created_utc, comment.subreddit))
            # data.append((pid, ";".join(post_comment)))
    except:
        print("Error! Skip the Current subreddit")
    df = pd.DataFrame(data, columns=["post_id", "comment_id", "comment_parent_id", "comment_body", "comment_link_id", "comment_author", "comment_score", "comment_created", "comment_subreddit"])  # append tuple
    df.to_csv(dest, index=False, encoding='utf-8')
if __name__ == "__main__":
    reddit_api = praw.Reddit(
        client_id="####",
        client_secret="####",
        user_agent="####",
        username="####",
        password="####"
    )
    # reddit_api = init_praw(client_id, client_secret, user_agent, username, password)
    csv_file = "####"
    dest_dir = "####"
    dest_name = "reddits_comments.csv"
    Path(dest_dir).mkdir(parents=True, exist_ok=True)
    dest = os.path.join(dest_dir, dest_name)
    scrape_comments(reddit_api, csv_file, dest)
You should put the try/except around a smaller portion of your code, as said in the comments. Here's an illustration of that:
def scrape_comments(reddit_api, csv_file, dest):
    df = pd.read_csv(csv_file)
    data = []
    for pid in df.id:
        try:
            # post_comment = []
            submission = reddit_api.submission(id=pid)
            submission.comments.replace_more(limit=None)
            for comment in submission.comments.list():
                # post_comment.append(comment.body)
                data.append((pid, comment.id, comment.parent_id, comment.body, comment.link_id, comment.author, comment.score, comment.created_utc, comment.subreddit))
            # data.append((pid, ";".join(post_comment)))
        except Exception:
            print("Error! Skip the Current subreddit")
    df = pd.DataFrame(data, columns=["post_id", "comment_id", "comment_parent_id", "comment_body", "comment_link_id", "comment_author", "comment_score", "comment_created", "comment_subreddit"])  # append tuple
    df.to_csv(dest, index=False, encoding='utf-8')
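If you also want the message to say which post was skipped and why, a minimal variation of the except clause above (just a sketch; it still catches any exception and moves on to the next id) is:

        except Exception as exc:
            # Report the failing post id and the reason before moving on
            print(f"Skipping submission {pid}: {exc}")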
Here's my code:
import glob
import itertools
import sys, os
import six
import csv
import numpy as np
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdftypes import resolve1
os.chdir("PATH/pdf")
extension = 'pdf'
all_filenames = [i for i in glob.glob('*.{}'.format(extension))]
valeur = []
n = 1
for i in all_filenames:
    fp = open(i, "rb")
    parser = PDFParser(fp)
    doc = PDFDocument(parser)
    fields = resolve1(doc.catalog["AcroForm"])["Fields"]
    for i in fields:
        field = resolve1(i)
        name, value = field.get("T"), field.get("V")
        filehehe = "{0}:{1}".format(name, value)
        values = resolve1(value)
        names = resolve1(name)
        valeur.append(values)
    n = n + 1

with open('test.csv', 'wb') as f:
    for i in valeur:
        f.write(i)
The goal here is to pick up some information from the PDFs. Here's the output:
As you can see, the format is not pretty. I'm not very familiar with open(), so I'm kind of stuck.
I would like a distinct row for each PDF, with each piece of information in its own cell. Something like this:
Try to store the data from each PDF file in a separate list, and add that list to the valeur list you already have.
Use the csv module, as @martineau rightly suggested.
You can try the code below.
import csv
valeur = []
#your code
n = 1
for i in all_filenames:
    temp_list = []
    fp = open(i, "rb")
    parser = PDFParser(fp)
    doc = PDFDocument(parser)
    fields = resolve1(doc.catalog["AcroForm"])["Fields"]
    for i in fields:
        field = resolve1(i)
        name, value = field.get("T"), field.get("V")
        filehehe = "{0}:{1}".format(name, value)
        values = resolve1(value)
        names = resolve1(name)
        temp_list.append(values)
    n = n + 1
    valeur.append(temp_list)

# Finally, when you have the required data, you can write it to a csv file like this.
with open('mycsv.csv', 'w', newline='') as myfile:
    wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
    for val in valeur:
        wr.writerow(val)
With this, each PDF ends up on its own row in the CSV, with one field value per cell.
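If you also want a header row with the form-field names, here is one possible extension of the same loop (only a sketch; it assumes every PDF exposes the same fields in the same order, and it reuses the name values that the loop above already resolves but discards):

valeur = []
noms = []   # header row: the form-field names, taken from the first PDF
for i in all_filenames:
    temp_list = []
    fp = open(i, "rb")
    parser = PDFParser(fp)
    doc = PDFDocument(parser)
    fields = resolve1(doc.catalog["AcroForm"])["Fields"]
    for j in fields:
        field = resolve1(j)
        name, value = field.get("T"), field.get("V")
        temp_list.append(resolve1(value))
        if not valeur:              # only while processing the first PDF
            noms.append(resolve1(name))
    valeur.append(temp_list)

with open('mycsv.csv', 'w', newline='') as myfile:
    wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
    wr.writerow(noms)               # header row
    for val in valeur:
        wr.writerow(val)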
I am having issues getting a nested for loop to output individual CSV files for an API call. The API call is paginated, so we have to query the API multiple times and append the data. We also have to loop through every exchange.
The way the code is now, it only outputs the last page of data for a couple of exchanges, and the following exchanges just have 'name' in the CSV, no other data...
from pycoingecko import CoinGeckoAPI
cg = CoinGeckoAPI()
import pandas as pd
import time
# grab a list of all the exchanges listed on CG
ex_list = cg.get_exchanges_list()
#normalise the json
df = pd.json_normalize(ex_list)
#output to csv
#df.to_csv('exchange_list.csv', encoding='utf-8', index=False)
#make a list with just one column
id_list = df['id'].to_list()
def read_exchange_tickers():
    for x in id_list:
        for i in range(1, 10):
            appended_data = []
            data = cg.get_exchanges_tickers_by_id(x, page=str(i))
            appended_data.append(data)
            # time.sleep(10)
            # define path + filename
            path = 'ticker_lists/'
            filename = path + x + '_' + '.csv'
            appended_data = pd.json_normalize(appended_data, record_path=['tickers'], meta=['name'])
            appended_data.to_csv(filename, encoding='utf-8', index=False)
            time.sleep(10)

read_exchange_tickers()
You should collect all the data for each id first and only then save it to a file:
def read_exchange_tickers():
    for x in id_list:
        appended_data = []
        # collect all the data for the current id
        for i in range(1, 10):
            data = cg.get_exchanges_tickers_by_id(x, page=str(i))
            appended_data.append(data)
        # save the data to csv
        path = 'ticker_lists/'
        filename = path + x + '_' + '.csv'
        appended_data = pd.json_normalize(appended_data, record_path=['tickers'], meta=['name'])
        appended_data.to_csv(filename, encoding='utf-8', index=False)
        time.sleep(10)
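If an exchange has fewer than nine pages of tickers, the loop above still makes the extra requests. A small refinement (a sketch; it assumes each response is a dict with a 'tickers' list, which the json_normalize call already relies on) is to stop paging once a page comes back empty:

def read_exchange_tickers():
    for x in id_list:
        appended_data = []
        for i in range(1, 10):
            data = cg.get_exchanges_tickers_by_id(x, page=str(i))
            if not data.get('tickers'):
                break  # no more results for this exchange
            appended_data.append(data)
        if not appended_data:
            continue  # nothing to write for this exchange
        filename = 'ticker_lists/' + x + '_' + '.csv'
        df = pd.json_normalize(appended_data, record_path=['tickers'], meta=['name'])
        df.to_csv(filename, encoding='utf-8', index=False)
        time.sleep(10)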
I am trying to iterate through PDFs to extract information from emails. My individual regex statements work when I try them on individual examples; however, when I put all the code together in a for loop to iterate over multiple PDFs at once, I am unable to append to my aggregate df (I'm currently just creating an empty df). I need the try/except because not all emails have all fields (e.g. some do not have the 'Attachments' field). Below is the code I have written so far:
import os
import re
import pandas as pd
pd.options.display.max_rows = 999
import numpy
from numpy import NaN
from tika import parser

root = r"my_dir"
agg_df = pd.DataFrame()

for directory, subdirectory, files in os.walk(root):
    for file in files:
        filepath = os.path.join(directory, file)
        print(file)
        raw = parser.from_file(filepath)
        img = raw['content']
        img = img.replace('\n', '')
        try:
            from_field = re.search(r'From:(.*?)Sent:', img).group(1)
        except:
            pass
        try:
            sent_field = re.search(r'Sent:(.*?)To:', img).group(1)
        except:
            pass
        try:
            to_field = re.search(r'To:(.*?)Cc:', img).group(1)
        except:
            pass
        try:
            cc_field = re.search(r'Cc:(.*?)Subject:', img).group(1)
        except:
            pass
        try:
            subject_field = re.search(r'Subject:(.*?)Attachments:', img).group(1)
        except:
            pass
        try:
            attachments_field = re.search(r'Attachments:(.*?)NOTICE', img).group(1)
        except:
            pass
        img_df = pd.DataFrame(columns=['From', 'Sent', 'To',
                                       'Cc', 'Subject', 'Attachments'])
        img_df['From'] = from_field
        img_df['Sent'] = sent_field
        img_df['To'] = to_field
        img_df['Cc'] = cc_field
        img_df['Subject'] = subject_field
        img_df['Attachments'] = attachments_field
        agg_df = agg_df.append(img_df)
There are two things:
1. When you don't get a match, don't just pass over the exception; fall back to a default value instead.
2. Don't append to your dataframe each time through the loop. That is slow. Keep everything in a dictionary, and then construct the dataframe at the end.
E.g.
import os
import re
import pandas as pd
from collections import defaultdict
from tika import parser

data = defaultdict(list)

for directory, _, files in os.walk(root):
    for file in files:
        filepath = os.path.join(directory, file)
        print(file)
        raw = parser.from_file(filepath)
        img = raw['content']
        img = img.replace('\n', '')

        from_match = re.search(r'From:(.*?)Sent:', img)
        if not from_match:
            sent_by = None
        else:
            sent_by = from_match.group(1)
        data["from"].append(sent_by)

        sent_match = re.search(r'Sent:(.*?)To:', img)
        if not sent_match:
            sent_date = None
        else:
            sent_date = sent_match.group(1)
        data["sent"].append(sent_date)

        # All your other regexes

df = pd.DataFrame(data)
Also, if you're doing this for a lot of files, you should look into using compiled expressions.
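A minimal sketch of what that could look like, using the same patterns as above and compiling each one once before the os.walk loop (extract_fields is a hypothetical helper, not part of the original code):

import re

# Compile each pattern once, outside the loop, and reuse the compiled objects.
FIELD_PATTERNS = {
    "from": re.compile(r'From:(.*?)Sent:'),
    "sent": re.compile(r'Sent:(.*?)To:'),
    "to": re.compile(r'To:(.*?)Cc:'),
    "cc": re.compile(r'Cc:(.*?)Subject:'),
    "subject": re.compile(r'Subject:(.*?)Attachments:'),
    "attachments": re.compile(r'Attachments:(.*?)NOTICE'),
}

def extract_fields(img):
    """Return a dict mapping each field to its matched text, or None when absent."""
    row = {}
    for field, pattern in FIELD_PATTERNS.items():
        match = pattern.search(img)
        row[field] = match.group(1) if match else None
    return row

Inside the loop you would then call extract_fields(img) once per file and append each value to the corresponding list in data.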
Currently I am grabbing an Excel file from a folder with Python just fine (in the code below) and pushing it to a web form via Selenium.
However, I am trying to modify this so it continues through a directory of multiple files (there will be many Excel files in my 'directory' or 'folder').
main.py
from data.find_pending_records import FindPendingRecords
from vital.vital_entry import VitalEntry
if __name__ == "__main__":
    try:
        # Instantiates FindPendingRecords then gets records to process
        PENDING_RECORDS = FindPendingRecords().get_excel_data()
        # Reads excel to map data from excel to vital
        MAP_DATA = FindPendingRecords().get_mapping_data()
        # Configures Driver for vital
        VITAL_ENTRY = VitalEntry()
        # Start chrome and navigate to vital website
        VITAL_ENTRY.instantiate_chrome()
        # Begin processing Records
        VITAL_ENTRY.process_records(PENDING_RECORDS, MAP_DATA)
        print("All done, Bill")
    except Exception as exc:
        print(exc)
config.py
FILE_LOCATION = r"C:\Zip\2019.02.12 Data Docs.zip"
UNZIP_LOCATION = r"C:\Zip\Pending"
VITAL_URL = 'http://boringdatabasewebsite:8080/Horrible'
HEADLESS = False
PROCESSORS = 4
MAPPING_DOC = ".//map/mapping.xlsx"
find_pending_records.py
"""Module used to find records that need to be inserted into Horrible website"""
from zipfile import ZipFile
import math
import pandas
import config
class FindPendingRecords:
    """Class used to find records that need to be inserted into Site"""

    @classmethod
    def find_file(cls):
        """Finds the excel file to process"""
        archive = ZipFile(config.FILE_LOCATION)
        for file in archive.filelist:
            if file.filename.__contains__('Horrible Data Log '):
                return archive.extract(file.filename, config.UNZIP_LOCATION)
        return FileNotFoundError

    def get_excel_data(self):
        """Places excel data into pandas dataframe"""
        excel_data = pandas.read_excel(self.find_file())
        columns = pandas.DataFrame(columns=excel_data.columns.tolist())
        excel_data = pandas.concat([excel_data, columns])
        excel_data.columns = excel_data.columns.str.strip()
        excel_data.columns = excel_data.columns.str.replace("/", "_")
        excel_data.columns = excel_data.columns.str.replace(" ", "_")
        num_valid_records = 0
        for row in excel_data.itertuples():
            person = row.PERSON
            if person in ("", " ", None) or math.isnan(person):
                print(f"Invalid record: {row}")
                excel_data = excel_data.drop(excel_data.index[row.Index])
            else:
                num_valid_records += 1
        print(f"Processing #{num_valid_records} records")
        return self.clean_data_frame(excel_data)

    def clean_data_frame(self, data_frame):
        """Cleans up dataframes"""
        for col in data_frame.columns:
            if "date" in col.lower():
                data_frame[col] = pandas.to_datetime(data_frame[col],
                                                     errors='coerce', infer_datetime_format=True)
                data_frame[col] = data_frame[col].dt.date
        data_frame['PERSON'] = data_frame['PERSON'].astype(int).astype(str)
        return data_frame

    def get_mapping_data(self):
        map_data = pandas.read_excel(config.MAPPING_DOC, sheet_name='main')
        columns = pandas.DataFrame(columns=map_data.columns.tolist())
        return pandas.concat([map_data, columns])
One way is as below (pseudocode)
class FindPendingRecords:
    @classmethod
    def find_file(cls):
        return ["file1", "file2", "file3"]

    def __init__(self):
        self.files = self.find_file()

    def get_excel_data(self):
        for excel_data in self.files:
            # process your excel_data
            yield excel_data
Your main should be
if __name__ == "__main__":
    try:
        for PENDING_RECORDS in FindPendingRecords().get_excel_data():
            # Do operations on PENDING_RECORDS
            print(PENDING_RECORDS)
        print("All done, Bill")
    except Exception as exc:
        print(exc)
Your find_file method will be
@classmethod
def find_file(cls):
    """Finds the excel files to process"""
    all_files = list()
    archive = ZipFile(config.FILE_LOCATION)
    for file in archive.filelist:
        if file.filename.__contains__('Horrible Data Log '):
            all_files.append(archive.extract(file.filename, config.UNZIP_LOCATION))
    return all_files
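To make the get_excel_data part of the pseudocode concrete, it could become a generator that yields one cleaned dataframe per extracted file. This is only a sketch; it assumes the rest of the class (including clean_data_frame) stays as in the question, and that __init__ stores self.files = self.find_file() as shown in the pseudocode above:

    def get_excel_data(self):
        """Yields one cleaned pandas dataframe per extracted Excel file."""
        for path in self.files:
            excel_data = pandas.read_excel(path)
            # Same column clean-up as the original single-file version
            excel_data.columns = excel_data.columns.str.strip()
            excel_data.columns = excel_data.columns.str.replace("/", "_")
            excel_data.columns = excel_data.columns.str.replace(" ", "_")
            yield self.clean_data_frame(excel_data)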
I am trying to convert xls to JSON, but when I execute the code it does not give me the data inside the xls sheet; it only gives me the JSON structure.
Below is the code I am running. I am not able to understand what further modification I should make so that I get a proper JSON file.
Please note: the input is a binary stream, and the output is also a stream, not a file.
#!/usr/bin/python -u
import sys
import xlrd
import simplejson
from collections import OrderedDict
wb = xlrd.open_workbook(file_contents=sys.stdin.read())
for sheet_index in range(wb.nsheets):
    # print sheet_index
    sh = wb.sheet_by_index(sheet_index)
    # print "Processing sheet no ", sheet_index
    attributes = sh.row_values(0)
    # print attributes
    rows_list = []
    attr_list = []
    # print attr_list[0]
    for rownum in range(1, sh.nrows):
        row_val_list = sh.row_values(rownum)
        row_dict = OrderedDict()
        for index in range(len(attr_list)):
            row_dict[attr_list[index]] = row_val_list[index]
            # row_dict['ID'] = row_val_list[0]
            # row_dict['Name'] = row_val_list[1]
            # rows_list.append(row_dict)
            # json_data = simplejson.dumps(rows_list)
            # sys.stdout.write(json_data)
        rows_list.append(row_dict)
    json_data = simplejson.dumps(rows_list)
    sys.stdout.write(json_data)
    # json_data = simplejson.dumps(rows_list)
    # sys.stdout.write(json_data)
Any help is much appreciated
Here is the corrected, working Python code:
#!/usr/bin/python -u
import sys
import xlrd
import simplejson
from collections import OrderedDict
wb = xlrd.open_workbook(file_contents=sys.stdin.read())
#print "Sheets are .... ", wb.nsheets
for sheet_index in range(wb.nsheets):
    sh = wb.sheet_by_index(sheet_index)
    if sh.nrows == 0:
        continue
    attr_list = sh.row_values(0)    # header row supplies the keys; this was left empty in the original
    rows_list = []
    for rownum in range(1, sh.nrows):
        row_values = sh.row_values(rownum)
        row_dict = OrderedDict()
        for index in range(len(attr_list)):
            row_dict[attr_list[index]] = row_values[index]
        rows_list.append(row_dict)
    json_data = simplejson.dumps(rows_list)
    sys.stdout.write(json_data)
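One thing to note: when the workbook has more than one non-empty sheet, the script above writes one JSON array per sheet back to back, which is not a single valid JSON document. A possible variant (a sketch, keeping the same xlrd/simplejson approach) is to emit one object keyed by sheet name:

#!/usr/bin/python -u
import sys
import xlrd
import simplejson
from collections import OrderedDict

wb = xlrd.open_workbook(file_contents=sys.stdin.read())
sheets = OrderedDict()
for sheet_index in range(wb.nsheets):
    sh = wb.sheet_by_index(sheet_index)
    if sh.nrows == 0:
        continue
    attr_list = sh.row_values(0)
    rows_list = []
    for rownum in range(1, sh.nrows):
        # zip pairs each header name with the value in the same column
        rows_list.append(OrderedDict(zip(attr_list, sh.row_values(rownum))))
    sheets[sh.name] = rows_list
sys.stdout.write(simplejson.dumps(sheets))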