Blank Strings Are Returned in Python Dataframe - python

I wrote a code to convert PDF to CSV, read the CSV file, and export only relevant information from the CSV file. The function is supposed to return filtered information such as english_name: 'someones name', original_language_name: 'someones name' etc, but instead the command returned english_name: '', original_language_name: '' etc. Below is the code that I wrote:
import pandas as pd
import tabula
from pandas import DataFrame
from backend.classes import Shareholder, Officer
from typing import List
def strip_string(string):
    """Coerce any cell value to text and trim surrounding whitespace."""
    text = str(string)
    return text.strip()
def get_float_without_thousands_separator(string, thousands_separator):
    """Parse *string* as a float after removing every thousands separator."""
    cleaned = string.replace(thousands_separator, '')
    return float(cleaned)
def extract_officers_and_shareholders_lists_from_df(df, total_number_of_shares, no_data_placeholder, number_of_shares, thousands_separator):
    """Split the parsed registry table into Officer and Shareholder records.

    A row with a real value in the 'Jabatan' (position) column becomes an
    Officer; otherwise a row with a real share count becomes a Shareholder
    whose percentage is computed against total_number_of_shares.
    Returns the pair (officers, shareholders).
    """
    NAME = 'Nama'
    POSITION = 'Jabatan'
    officers = []
    shareholders = []
    for idx in range(df.shape[0]):
        position = strip_string(df[POSITION][idx])
        if position != no_data_placeholder:
            name = strip_string(df[NAME][idx])
            officers.append(Officer(
                english_name=name,
                original_language_name=name,
                position=position))
            continue
        shares_text = strip_string(df[number_of_shares][idx])
        if shares_text != no_data_placeholder:
            name = strip_string(df[NAME][idx])
            shares = get_float_without_thousands_separator(shares_text, thousands_separator)
            shareholders.append(Shareholder(
                english_name=name,
                original_language_name=name,
                shareholding_percentage=(shares / total_number_of_shares) * 100))
    return officers, shareholders
def get_officers_and_shareholders_lists(pdf_input_file):
    """Convert the PDF registry to CSV and build officer/shareholder lists.

    The CSV is written to a fixed path and re-read with a header offset of 3
    rows; the grand total of shares is pre-computed so the extractor can turn
    raw share counts into percentages.
    """
    NO_DATA_PLACEHOLDER = '-'
    NUMBER_OF_SHARES = 'Jumlah Lembar Saham'
    THOUSANDS_SEPARATOR = '.'
    output_file_path = 'CSV/Officers_and_Shareholders.csv'
    tabula.convert_into(pdf_input_file, output_file_path, output_format='csv', pages='all')
    df = pd.read_csv(output_file_path, header=3, on_bad_lines='skip')
    share_cells = [
        strip_string(cell)
        for cell in df[NUMBER_OF_SHARES].to_list()
        if strip_string(cell) != NO_DATA_PLACEHOLDER
    ]
    total_number_of_shares = sum(
        get_float_without_thousands_separator(cell, THOUSANDS_SEPARATOR)
        for cell in share_cells
    )
    return extract_officers_and_shareholders_lists_from_df(
        df=df,
        total_number_of_shares=total_number_of_shares,
        number_of_shares=NUMBER_OF_SHARES,
        no_data_placeholder=NO_DATA_PLACEHOLDER,
        thousands_separator=THOUSANDS_SEPARATOR)
The command I use to run the code above is python3 -m backend.officers_and_shareholders. Is there a change I can make so that english_name and original_language_name return the actual names instead of empty strings?

Related

pylucene cannot find a word that is present in a text that was indexed earlier

I use pylucene 9.4.1 to index a document, and I just noticed a weird problem. There are some words, e.g. 'baby', that are present in the document, but pylucene is unable to find them in the index.
This is my code to index the document:
(The document can be downloaded from here.)
# Absolute path to the Kaggle movie-plots CSV, expected in the working directory.
filepath = os.getcwd() + '/' + 'wiki_movie_plots_deduped.csv'
def indexDocument(title, year, plot):
    """Add one movie (Title + Plot fields) to the module-level IndexWriter.

    NOTE(review): *year* is accepted but never stored in the index — confirm
    whether a Release Year field was intended.
    """
    field_type = FieldType()
    field_type.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS)
    field_type.setStored(True)
    field_type.setTokenized(True)
    field_type.setStoreTermVectors(True)
    field_type.setStoreTermVectorPositions(True)
    field_type.setStoreTermVectorOffsets(True)
    entry = document.Document()
    entry.add(document.Field("Title", title, field_type))
    entry.add(document.Field("Plot", plot, field_type))
    writer.addDocument(entry)
def CloseWriter():
    """Commit pending documents and close the module-level IndexWriter."""
    writer.close()
def makeInvertedIndex(file_path):
    """Read the CSV at *file_path* and index every row's Title/Plot pair."""
    df = pd.read_csv(file_path)
    print(df.columns)
    for docid, i in enumerate(df.index):
        print(docid, '-', df['Title'][i])
        indexDocument(df['Title'][i], df['Release Year'][i], df['Plot'][i])
# Build the on-disk index under ./index/ using the EnglishAnalyzer (stemming).
# NOTE(review): the search snippet below queries with StandardAnalyzer; an
# analyzer mismatch (e.g. 'baby' stemmed to 'babi' at index time) will make
# indexed words unfindable — confirm both sides use the same analyzer.
indexPath = File('index/').toPath()
indexDir = FSDirectory.open(indexPath)
writerConfig = IndexWriterConfig(EnglishAnalyzer())
writer = IndexWriter(indexDir, writerConfig)
inverted = makeInvertedIndex(filepath)
CloseWriter()
This is the code to search the created index for a keyword:
# Search the previously built index for a single keyword in the Title field.
keyword = 'baby'
fieldname = 'Title'
result = list()  # NOTE(review): never used below
indexPath = File('index/').toPath()
directory = FSDirectory.open(indexPath)
# NOTE(review): the index was written with EnglishAnalyzer (stemming) but is
# queried with StandardAnalyzer; stemmed terms will not match the raw query —
# confirm the analyzers are meant to differ.
analyzer = StandardAnalyzer()
reader = DirectoryReader.open(directory)  # NOTE(review): opened but unused; the searcher opens its own reader
searcher = IndexSearcher(DirectoryReader.open(directory))
query = QueryParser(fieldname, analyzer).parse(keyword)
print('query', query)
# count() runs before the BM25 similarity is set, so it uses the default similarity.
numdocs = searcher.count(query)
print("#-docs:", numdocs)
searcher.setSimilarity(BM25Similarity(1.2,0.75))
scoreDocs = searcher.search(query, 1000).scoreDocs # it returns TopDocs object containing scoreDocs and totalHits
# scoreDoc object contains docId and score
print('total hit:', searcher.search(query, 100).totalHits)
print("%s total matching documents" % (len(scoreDocs)))

MapReduce on several columns

I am new to MapReduce and have troubles with writing a MapReduce job which would work with several columns at once for a CSV file.
I would like to find for each garment group the most frequent product, the second most frequent section and the most frequent department so that the output would look according to the following schema: garment_group, product, section, department. This is based on the articles.csv dataset from kaggle.
So far I could only find for each garment group the most frequent product and do not understand how to incorporate the other columns. This is my code:
import csv
import logging

from mrjob.job import MRJob
from mrjob.step import MRStep
from mrjob.util import log_to_stream, log_to_null
from mr3px.csvprotocol import CsvProtocol
log = logging.getLogger(__name__)
class MyMRJob1(MRJob):
    """Two-step job: count (garment group, product) pairs, then keep the
    most frequently bought product for each garment group."""

    OUTPUT_PROTOCOL = CsvProtocol  # final output is written as CSV rows

    def set_up_logging(cls, quiet=False, verbose=False, stream=None):
        """Route mrjob and __main__ log records to the given stream."""
        log_to_stream(name='mrjob', debug=verbose, stream=stream)
        log_to_stream(name='__main__', debug=verbose, stream=stream)

    def mapper(self, _, line):
        """Emit ((garment_group, product), 1) for every usable CSV row."""
        fields = next(csv.reader([line], quotechar=None))
        group = fields[23]
        product = fields[2]
        # Skip the header row and rows missing either key column.
        if product == "prod_name" or not product or not group:
            return
        yield (group, product), 1

    def reducer(self, garmetProd, valuelist):
        """Total the occurrences of one (garment group, product) pair."""
        group, product = garmetProd
        yield None, (group, product, sum(valuelist))

    def mapper_top(self, _, line):
        """Re-key the pair counts by garment group for the final reduce."""
        # The previous step's value arrives as a list of strings already.
        group, product, count = line[0], line[1], line[2]
        yield group, (product, count)

    def reducer_top(self, garmet, values):
        """Keep the first product with the strictly highest purchase count."""
        best_product = ""
        best_count = 0
        for product, count in values:
            if int(count) > best_count:
                best_count = int(count)
                best_product = product
        if best_count > 0:
            # CsvProtocol requires a None key in the final output.
            yield None, (garmet, best_product)

    def steps(self):
        return [
            MRStep(mapper=self.mapper, reducer=self.reducer),
            MRStep(mapper=self.mapper_top, reducer=self.reducer_top),
        ]
# Standard mrjob entry point: parse the command line and run the job.
if __name__ == '__main__':
    MyMRJob1.run()

How to pass two or more dataframes from a module to main script

Edit with #RJ Adriaansen update:
I'm trying to pull two or more dataframes from a module so that I can use the data in the main script.
I only get 4 empty dataframes returned from the df_make module.
The main and df_make codes are below.
Any advice would be great thanks.
import pandas as pd
import df_make
# Module-level placeholder frames. NOTE(review): df_make.jky_trn rebinds its
# parameters locally and returns new frames, so these objects are never
# filled in — main() prints them still empty. Confirm intent.
df_trn = pd.DataFrame()
df_trn_trk = pd.DataFrame()
df_jky = pd.DataFrame()
df_jky_code = pd.DataFrame()
def main():
    """Build the jockey/trainer frames via df_make and print them.

    Bug fix: df_make.jky_trn() rebinds its parameters to brand-new
    DataFrames and *returns* them; rebinding inside the callee never
    affects the caller's names, so the module-level frames stayed empty.
    Capture the returned tuple (df_jky, df_jky_code, df_trn, df_trn_trk)
    instead of expecting in-place mutation.
    """
    jky, jky_code, trn, trn_trk = df_make.jky_trn(
        df_trn, df_trn_trk, df_jky, df_jky_code)
    # Preserve the original print order: trainers first, then jockeys.
    print(trn)
    print(trn_trk)
    print(jky)
    print(jky_code)
# Run only when executed as a script, not when imported as a module.
if __name__ == '__main__':
    main()
import pandas as pd
#def jky_trn(df_trn):
def jky_trn(df_trn, df_trn_trk, df_jky, df_jky_code):
    """Load the racecards workbook and return four de-duplicated frames.

    Returns (df_jky, df_jky_code, df_trn, df_trn_trk). The incoming frames
    are ignored (parameters kept only for backward compatibility): the
    function rebinds the names locally, so callers must use the return value
    rather than expect in-place mutation.

    Bug fixes: r"..." for the file-name fragment ("\R" in a normal literal
    is an invalid escape sequence — a DeprecationWarning today and a
    SyntaxError in future Python versions), and explicit regex=True on
    Series.str.replace (pandas >= 2.0 defaults to literal matching, which
    would leave the escaped pattern unreplaced).
    """
    path = r"C:\Users\chris\Documents\UKHR\PythonSand\PY_Scripts\StackOF"
    xls_tbl = r"\Racecards.xlsx"
    xls_link = path + xls_tbl
    df1 = pd.read_excel(xls_link, usecols=["Jockey", "Course", "RaceDesc"])
    df2 = pd.read_excel(xls_link, usecols=["Trainer", "Course", "RaceDesc"])
    df1 = df1.drop_duplicates(subset=["Jockey", "Course", "RaceDesc"])
    df1 = df1.dropna()  # remove rows with NaN
    # Strip the " (AW)" (all-weather) suffix from course names.
    df1['Course'] = df1['Course'].str.replace(r' \(AW\)', '', regex=True)
    df2['Course'] = df2['Course'].str.replace(r' \(AW\)', '', regex=True)
    df_jky = df1[['Jockey']].copy().drop_duplicates()
    df_jky_code = df1[['Jockey', 'Course']].copy().drop_duplicates()
    df_trn = df2[['Trainer']].copy().drop_duplicates()
    df_trn_trk = df2[['Trainer', 'Course']].copy().drop_duplicates()
    return df_jky, df_jky_code, df_trn, df_trn_trk
So, it turns out that I needed to refer to the dataframes as a tuple item in the main script e.g. df_jt = df_make.jky_trn()
The new main script code is:
import pandas as pd
import df_make
def main():
    """Fetch the four frames returned by df_make.jky_trn() and print each
    in return order: df_jky, df_jky_code, df_trn, df_trn_trk."""
    df_jky, df_jky_code, df_trn, df_trn_trk = df_make.jky_trn()
    print(df_jky)
    print(df_jky_code)
    print(df_trn)
    print(df_trn_trk)
# Bug fix: the guard must compare the dunder names; `if name == 'main':`
# raises NameError at import time (the underscores were lost in formatting).
if __name__ == '__main__':
    main()

Instantiating a class for text analytics

I’ve found this code, a Python Class which takes a WhatsApp conversation text file processes and generates a Chat class which I can interact with. Things like generate charts, the response matrix etc.:
import re
import time
import pandas as pd
import dateutil
import matplotlib.pyplot as plt
class WppAnalyser:
    """Parse an exported WhatsApp chat log into a per-message DataFrame."""

    def open_file(self):
        """Return the chat file's contents as a list of lines.

        Uses a context manager so the handle is closed (the original leaked
        it). Assumes self.filename is set by the caller — TODO confirm, no
        __init__ is defined here.
        """
        with open(self.filename, 'r') as handle:
            return handle.read().splitlines()

    def ismessage(self, str):
        """Classify one chat line.

        Returns [name, message, timestamp] when the line starts a new
        message, otherwise ["", "", original line] for continuation lines.

        Bug fix: the regex patterns had lost their backslash escapes
        ('w{3}s{1}...' instead of word/space/digit character classes), so no
        timestamp ever matched and every line was treated as a continuation.
        NOTE: the parameter shadows the built-in `str`; kept unchanged for
        backward compatibility with keyword callers.
        """
        patterns = {
            # "Jan 5, 2020, 10:30" style
            "hor1": r'\w{3}\s{1}[0-9]{1,2},\s{1}\d{4},\s{1}\d{2}:\d{2}',
            # "Jan 5, 10:30" style (no year)
            "hor2": r'\w{3}\s{1}[0-9]{1,2},\s{1}\d{2}:\d{2}',
            # "5 Jan 10:30" style
            "imp2": r'\d{1,2}\s\w{3}\s\d{2}:\d{2}',
            # "5 Jan 2020 10:30" style
            "imp1": r'\d{1,2}\s\w{3}\s\d{4}\s\d{2}:\d{2}'
        }
        for key in patterns:
            result = re.search(patterns[key], str)
            # Two colons = one inside the time, one after the sender's name.
            if result and str.count(':') >= 2:
                name_start = str.find("-") + 2
                first_colon = str.find(":")
                name_end = str.find(":", first_colon + 1)
                name = str[name_start:name_end]
                message = str[name_end + 1:]
                return [name, message, result.group()]
        return ["", "", str]

    def process(self, content):
        """Build the chat DataFrame from the list of raw lines.

        Continuation lines inherit Name/date_string from the previous row.
        Bug fix: DataFrame.ix was removed in pandas 1.0; use .loc with
        explicit (row, column) labels instead of chained indexing.
        """
        j = 1
        df = pd.DataFrame(index=range(1, len(content) + 1),
                          columns=['Name', 'Message', 'date_string'])
        for i in content:
            results = self.ismessage(i)
            if results[0] != "":
                df.loc[j] = results
            else:
                df.loc[j, 'Name'] = df.loc[j - 1, 'Name']
                df.loc[j, 'date_string'] = df.loc[j - 1, 'date_string']
                df.loc[j, 'Message'] = results[2]
            j = j + 1
        df['Time'] = df['date_string'].map(lambda x: dateutil.parser.parse(x))
        df['Day'] = df['date_string'].map(lambda x: dateutil.parser.parse(x).strftime("%a"))
        df['Date'] = df['date_string'].map(lambda x: dateutil.parser.parse(x).strftime("%x"))
        df['Hour'] = df['date_string'].map(lambda x: dateutil.parser.parse(x).strftime("%H"))
How would I run these functions together, passing self in each function is confusing me. What would a main function looks like here?
I have to instantiate WppAnalyser class, right? So far, I tried this for the first method:
class Chat:
    # NOTE(review): attempt at instantiating the analyser. The x argument is
    # accepted but ignored — self.x is always a handle to the hard-coded
    # "chatPPL.txt", and the file is never closed. Presumably x was meant to
    # be the filename; confirm before wiring this into WppAnalyser.
    def __init__(self, x, y):
        self.x = open("chatPPL.txt", "r")
        self.y = y

Python: invalid syntax: <string>, line 1, pos 16

I have developed a code in Python in which -in order to run the program- I need to take some arguments from the command line. But I am getting continuously the same error:
Traceback (most recent call last):
File "<string>", line 1, in <fragment>
invalid syntax: <string>, line 1, pos 16
I have the faintest idea what is wrong with my code. So, I present my code below in case someone could help me:
import QSTK.qstkutil.qsdateutil as du
import QSTK.qstkutil.tsutil as tsu
import QSTK.qstkutil.DataAccess as da
import datetime as dt
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import time
import math
import copy
import QSTK.qstkstudy.EventProfiler as ep
import csv
import sys
import argparse
def readData(li_startDate, li_endDate, ls_symbols):
    """Fetch Yahoo price data for ls_symbols between two [year, month, day] dates.

    Returns [d_data, dt_start, dt_end, dt_timeofday, ldt_timestamps], where
    d_data maps each key ('open', 'close', ...) to its DataFrame.
    """
    # Build datetime bounds from the [year, month, day] triples (STL).
    dt_start = dt.datetime(*li_startDate[:3])
    dt_end = dt.datetime(*li_endDate[:3])
    # Closing prices are stamped at 16:00 on each NYSE trading day.
    dt_timeofday = dt.timedelta(hours=16)
    ldt_timestamps = du.getNYSEdays(dt_start, dt_end, dt_timeofday)
    # Yahoo-backed QSTK data source; cachestalltime=0 disables stale caching.
    c_dataobj = da.DataAccess('Yahoo', cachestalltime=0)
    ls_keys = ['open', 'high', 'low', 'close', 'volume', 'actual_close']
    # Map each key to its frame via a dict (hash-table lookup by key).
    ldf_data = c_dataobj.get_data(ldt_timestamps, ls_symbols, ls_keys)
    d_data = dict(zip(ls_keys, ldf_data))
    return [d_data, dt_start, dt_end, dt_timeofday, ldt_timestamps]
def marketsim(cash, orders_file, values_file):
    """Simulate the trades in orders_file and write daily portfolio values.

    Parameters: starting *cash*, a CSV of orders (columns X.1..X.3 = date,
    X.4 = symbol, X.5 = Buy/Sell, X.6 = share count), and the output path.

    Bug fixes relative to the original:
      * get_data result was bound to df_data but read as ldf_data (NameError);
      * pd.Dataframe -> pd.DataFrame (twice; the class name is CamelCase);
      * trades was indexed by list(ldt_timestamps[0]) — a single timestamp —
        instead of the whole trading-day list;
      * trades.fillna(0) returns a new frame, so the result must be rebound;
      * 'calue' typo -> value; 'file_out' typo -> fileout; the output file is
        now closed via a context manager.
    """
    orders = pd.read_csv(orders_file, index_col='Date', parse_dates=True, header=None)
    ls_symbols = list(set(orders['X.4'].values))
    df_lastrow = len(orders) - 1
    dt_start = dt.datetime(orders.get_value(0, 'X.1'),
                           orders.get_value(0, 'X.2'),
                           orders.get_value(0, 'X.3'))
    # +1 day so the last order date itself falls inside the trading window.
    dt_end = dt.datetime(orders.get_value(df_lastrow, 'X.1'),
                         orders.get_value(df_lastrow, 'X.2'),
                         orders.get_value(df_lastrow, 'X.3') + 1)
    # Closing prices are stamped at 16:00 on each NYSE trading day.
    dt_timeofday = dt.timedelta(hours=16)
    ldt_timestamps = du.getNYSEdays(dt_start, dt_end, dt_timeofday)
    c_dataobj = da.DataAccess('Yahoo', cachestalltime=0)
    ls_keys = ['open', 'high', 'low', 'close', 'volume', 'actual_close']
    ldf_data = c_dataobj.get_data(ldt_timestamps, ls_symbols, ls_keys)
    d_data = dict(zip(ls_keys, ldf_data))
    # Track share counts per symbol plus a pseudo-symbol for cash.
    ls_symbols.append("_CASH")
    trades = pd.DataFrame(index=list(ldt_timestamps), columns=list(ls_symbols))
    current_cash = cash
    trades["_CASH"][ldt_timestamps[0]] = current_cash
    current_stocks = dict()
    for symb in ls_symbols:
        current_stocks[symb] = 0
        trades[symb][ldt_timestamps[0]] = 0
    for row in orders.iterrows():
        row_data = row[1]
        current_date = dt.datetime(row_data['X.1'], row_data['X.2'], row_data['X.3'], 16)
        symb = row_data['X.4']
        stock_value = d_data['close'][symb][current_date]
        stock_amount = row_data['X.6']
        if row_data['X.5'] == "Buy":
            current_cash = current_cash - (stock_value * stock_amount)
            current_stocks[symb] = current_stocks[symb] + stock_amount
        else:
            current_cash = current_cash + (stock_value * stock_amount)
            current_stocks[symb] = current_stocks[symb] - stock_amount
        trades["_CASH"][current_date] = current_cash
        trades[symb][current_date] = current_stocks[symb]
    trades = trades.fillna(0)  # fillna returns a new frame; rebind it
    value_data = pd.DataFrame(index=list(ldt_timestamps), columns=list("V"))
    value_data = value_data.fillna(0)
    value_data = value_data.cumsum(axis=0)
    # Portfolio value per day: cash plus mark-to-market of every position.
    for day in ldt_timestamps:
        value = 0
        for sym in ls_symbols:
            if sym == "_CASH":
                value = value + trades[sym][day]
            else:
                value = value + trades[sym][day] * d_data['close'][sym][day]
        value_data["V"][day] = value
    with open(values_file, "w") as fileout:
        for row in value_data.iterrows():
            fileout.writelines(str(row[0].strftime('%Y,%m,%d')) + ", " + str(row[1]["V"].round()) + "\n")
def main(argv):
    """CLI entry point: marketsim.py <orders_file.csv> <values_file.csv>.

    Bug fixes: validate the *argv* parameter that is actually passed in
    (sys.argv[1:]) instead of re-reading sys.argv, so the function also works
    when called programmatically, and use print() as a function (the
    Python-2-only print statement is a SyntaxError under python3).
    """
    if len(argv) != 2:
        print("Invalid arguments for marketsim.py. It should be of the following syntax: marketsim.py orders_file.csv values_file.csv")
        sys.exit(0)
    # Starting cash is fixed; an earlier variant read it from the arguments.
    initial_cash = 1000000
    ordersFile = str(argv[0])
    valuesFile = str(argv[1])
    marketsim(initial_cash, ordersFile, valuesFile)

if __name__ == "__main__":
    main(sys.argv[1:])
The input I gave to the command line was:
python marketsim.py orders.csv values.csv
I guess that the problem lies either in the imports or in the main function (including the if statement below def main(argv)).
I have to point out that the files orders.csv and values.csv exist and are located into the same folder.
I hope have made everything clear.
So, I am looking forward to reading your answers community-mates! :D
Thank you!

Categories