Instantiating a class for text analytics

Instantiating a class for text analytics - python

I’ve found this code, a Python Class which takes a WhatsApp conversation text file processes and generates a Chat class which I can interact with. Things like generate charts, the response matrix etc.:
import re
import time
import pandas as pd
import dateutil
import matplotlib.pyplot as plt
class WppAnalyser:
def open_file(self):
x = open(self.filename,'r')
y = x.read()
content = y.splitlines()
return content
def ismessage(self,str):
patterns = {
"hor1":r'w{3}s{1}[0-9]{1,2},s{1}d{4},s{1}d{2}:d{2}',
"hor2":r'w{3}s{1}[0-9]{1,2},s{1}d{2}:d{2}',
"imp2":r'd{1,2}sw{3}sd{2}:d{2}',
"imp1":r'd{1,2}sw{3}sd{4}sd{2}:d{2}'
}
for key in patterns:
result = re.search(patterns[key], str)
if result and str.count(':') >=2:
name_start = str.find("-")+2
first_colon = str.find(":")
name_end = str.find(":", first_colon+1)
name=str[name_start:name_end]
message=str[name_end+1:]
return [name, message, result.group()]
return ["","",str]
def process(self,content):
j = 1
df = pd.DataFrame(index = range(1, len(content)+1), columns=[ 'Name', 'Message', 'date_string'])
for i in content:
results = self.ismessage(i)
if results[0] != "":
df.ix[j]=results
else:
df.ix[j]['Name']=df.ix[j-1]['Name']
df.ix[j]['date_string']=df.ix[j-1]['date_string']
df.ix[j]['Message']=results[2]
j = j+1
df['Time'] = df['date_string'].map(lambda x: dateutil.parser.parse(x))
df['Day'] = df['date_string'].map(lambda x: dateutil.parser.parse(x).strftime("%a"))
df['Date'] = df['date_string'].map(lambda x:dateutil.parser.parse(x).strftime("%x"))
df['Hour'] = df['date_string'].map(lambda x:dateutil.parser.parse(x).strftime("%H"))
How would I run these functions together, passing self in each function is confusing me. What would a main function looks like here?
I have to instantiate WppAnalyser class, right? So far, I tried this for the first method:
class Chat:
def __init__(self, x, y):
self.x = open("chatPPL.txt", "r")
self.y = y

Related

Blank Strings Are Returned in Python Dataframe

I wrote a code to convert PDF to CSV, read the CSV file, and export only relevant information from the CSV file. The function is supposed to return filtered information such as english_name: 'someones name', original_language_name: 'someones name' etc, but instead the command returned english_name: '', original_language_name: '' etc. Below is the code that I wrote:
import pandas as pd
import tabula
from pandas import DataFrame
from backend.classes import Shareholder, Officer
from typing import List
def strip_string(string):
return str(string).strip()
def get_float_without_thousands_separator(string, thousands_separator):
return float(string.replace(thousands_separator, ''))
def extract_officers_and_shareholders_lists_from_df(df, total_number_of_shares, no_data_placeholder, number_of_shares, thousands_separator):
officers = []
shareholders = []
NAME = 'Nama'
POSITION = 'Jabatan'
for row in range((df.shape[0])):
if str(df[POSITION][row]).strip() != no_data_placeholder:
original_language_name = strip_string(df[NAME][row])
english_name = strip_string(df[NAME][row])
position = strip_string(df[POSITION][row])
officer = Officer(english_name=english_name, original_language_name=original_language_name, position=position)
officers.append(officer)
elif str(df[number_of_shares][row]).strip() != no_data_placeholder:
original_language_name = strip_string(df[NAME][row])
english_name = strip_string(df[NAME][row])
number_of_shares_string = strip_string(df[number_of_shares][row])
number_of_shares_number = get_float_without_thousands_separator(number_of_shares_string, thousands_separator)
shareholding_percentage = (number_of_shares_number / total_number_of_shares) * 100
shareholder = Shareholder(english_name=english_name, original_language_name=original_language_name, shareholding_percentage=shareholding_percentage)
shareholders.append(shareholder)
return officers, shareholders
def get_officers_and_shareholders_lists(pdf_input_file):
NO_DATA_PLACEHOLDER = '-'
NUMBER_OF_SHARES = 'Jumlah Lembar Saham'
THOUSANDS_SEPARATOR = '.'
output_file_path = 'CSV/Officers_and_Shareholders.csv'
tabula.convert_into(pdf_input_file, output_file_path, output_format='csv', pages='all')
df = pd.read_csv(output_file_path, header=3, on_bad_lines='skip')
all_shares = df[NUMBER_OF_SHARES].to_list()
all_shares_strings = [strip_string(shares) for shares in all_shares if strip_string(shares) != NO_DATA_PLACEHOLDER]
all_shares_numbers = [get_float_without_thousands_separator(shares, THOUSANDS_SEPARATOR) for shares in all_shares_strings]
total_number_of_shares = sum(all_shares_numbers)
return extract_officers_and_shareholders_lists_from_df(
df=df,
total_number_of_shares=total_number_of_shares,
number_of_shares=NUMBER_OF_SHARES,
no_data_placeholder=NO_DATA_PLACEHOLDER,
thousands_separator=THOUSANDS_SEPARATOR)
The command call that I use for the codes on the above is python3 -m backend.officers_and_shareholders. Is there a method to pass in so that english_name returns a name, original_language_name returns a name?

Cant access function in OOP python

For some reason, in my fruit scraper, i cannot access anything from listify function.
I'am getting an error, for exmaple: NameError: name 'family' is not defined.
And i cant figure out what is wrong with my code - is my function is bad, or i'am doing something wrong with class ?
import requests
import json
import random
import pickle
class FruitScraper():
def __init__(self):
self.name = []
self.id = []
self.family = []
self.genus = []
self.order = []
self.carbohydrates = []
self.protein = []
self.fat = []
self.calories = []
self.sugar = []
def scrape_all_fruits(self):
data_list = []
try:
for ID in range(1, 10):
url = f'https://www.fruityvice.com/api/fruit/{ID}'
response = requests.get(url)
data = response.json()
data_list.append(data)
except:
pass
return data_list
def listify(self, stats):
alist = json.dumps(self.scrape_all_fruits())
jsonSTr = json.loads(alist)
for i in jsonSTr:
try:
self.name.append(i['name'])
self.id.append(i['id'])
self.family.append(i['family'])
self.genus.append(i['genus'])
self.order.append(i['order'])
self.carbohydrates.append(i['nutritions']['carbohydrates'])
self.protein.append(i['nutritions']['protein'])
self.fat.append(i['nutritions']['fat'])
self.calories.append(i['nutritions']['calories'])
self.sugar.append(i['nutritions']['sugar'])
except:
pass
return stats
def get_summary(self):
for i in self.listify(zip(self.fat, self.protein, self.calories, self.sugar, self.carbohydrates, self.name)):
nutr_stats = f'\nNutrients maximum statistics:\nFat: {max(self.fat)}\nProtein: {max(self.protein)}\nCarbohydrates: {max(self.carbohydrates)}\nCalories: {max(self.calories)}\nSugar: {max(self.sugar)}' \
f'\nNutrients minimum statistics:\nFat: {min(self.fat)}\nProtein: {min(self.protein)}\nCarbohydrates: {min(self.carbohydrates)}\nCalories: {min(self.calories)}\nSugar: {min(self.sugar)}' \
f'\nTotal fruits scraped: {len(self.name)}'
return nutr_stats
Scraped_info = FruitScraper().scrape_all_fruits()
Listified_info = FruitScraper().listify(family)
Fruits_statistics = FruitScraper().get_summary()
It's my first time doing OOP.

Please consider changing this
Scraped_info = FruitScraper().scrape_all_fruits()
Listified_info = FruitScraper().listify(family)
Fruits_statistics = FruitScraper().get_summary()
to
myScraper = FruitScraper()
Scraped_info = myScraper.scrape_all_fruits()
myScraper.listify()
Fruits_statistics = myScraper.get_summary()
Otherwise you create three different objects of this class and discard them with all their attributes after running the individual method once.
This might also be critical to define family in this line of the code:
Listified_info = myScraper.listify(family)
But I can't see how you intended to use the parameter stats in your method listify(). It is just received and returned. I suggest that you change:
def listify(self, stats):
to
def listify(self):
and remove
return stats
If you want to get those lists inside the object of this class returned by listify(), you may do the following (but this is not OOP way of doing things):
import requests
import json
import copy
class FruitScraper():
def __init__(self):
self.name = []
self.id = []
self.family = []
self.genus = []
self.order = []
self.carbohydrates = []
self.protein = []
self.fat = []
self.calories = []
self.sugar = []
def collect_all_lists(self):
self.allLists = dict('name': self.name, 'id': self.id, 'family': self.family, 'genus': self.genus, 'order': self.order, 'carbohydrates': self.carbohydrates, 'protein': self.protein, 'fat': self.fat, 'calories': self.calories, 'sugar': self.sugar)
def scrape_all_fruits(self):
data_list = []
try:
for ID in range(1, 10):
url = f'https://www.fruityvice.com/api/fruit/{ID}'
response = requests.get(url)
data = response.json()
data_list.append(data)
except:
pass
return data_list
def listify(self):
alist = json.dumps(self.scrape_all_fruits())
jsonSTr = json.loads(alist)
for i in jsonSTr:
try:
self.name.append(i['name'])
self.id.append(i['id'])
self.family.append(i['family'])
self.genus.append(i['genus'])
self.order.append(i['order'])
self.carbohydrates.append(i['nutritions']['carbohydrates'])
self.protein.append(i['nutritions']['protein'])
self.fat.append(i['nutritions']['fat'])
self.calories.append(i['nutritions']['calories'])
self.sugar.append(i['nutritions']['sugar'])
except:
pass
self.collect_all_lists()
return copy.deepcopy(self.allLists)
def get_summary(self):
for i in self.listify(zip(self.fat, self.protein, self.calories, self.sugar, self.carbohydrates, self.name)):
nutr_stats = f'\nNutrients maximum statistics:\nFat: {max(self.fat)}\nProtein: {max(self.protein)}\nCarbohydrates: {max(self.carbohydrates)}\nCalories: {max(self.calories)}\nSugar: {max(self.sugar)}' \
f'\nNutrients minimum statistics:\nFat: {min(self.fat)}\nProtein: {min(self.protein)}\nCarbohydrates: {min(self.carbohydrates)}\nCalories: {min(self.calories)}\nSugar: {min(self.sugar)}' \
f'\nTotal fruits scraped: {len(self.name)}'
return nutr_stats
myScraper = FruitScraper()
Scraped_info = myScraper.scrape_all_fruits()
Listified_info = myScraper.listify()
Fruits_statistics = myScraper.get_summary()

`FAISSDocumentStore` in `haystack` always returns empty results

I am new to haystack and I am using FAISSDocumentStore and EmbeddingRetriever to implement a QA system. This is my code:
from haystack.document_stores import InMemoryDocumentStore, FAISSDocumentStore
from haystack.nodes import TfidfRetriever, DensePassageRetriever, EmbeddingRetriever
from haystack.nodes import FARMReader, TransformersReader
from haystack.nodes import FARMReader
from haystack.pipelines import ExtractiveQAPipeline
import requests
import pandas as pd
class NeuralSearch:
def __init__(self):
self.HIDDEN_DIMS = 384
self.FAISS_INDEX = "Flat"
self.path = "https://raw.githubusercontent.com/deepset-ai/COVID-QA/master/data/faqs/faq_covidbert.csv"
#property
def document_store(self):
doc_store = FAISSDocumentStore(
embedding_dim=self.HIDDEN_DIMS, faiss_index_factory_str=self.FAISS_INDEX
)
return doc_store
#property
def retriever(self):
retriever = EmbeddingRetriever(
document_store=self.document_store,
embedding_model="sentence-transformers/all-MiniLM-L6-v2",
use_gpu=False,
)
return retriever
def load_data(self):
data = requests.get(self.path)
open("small_faq_covid.csv", "wb").write(data.content)
df = pd.read_csv("small_faq_covid.csv")
df.fillna(value="", inplace=True)
df["question"] = df["question"].apply(lambda x: x.strip())
questions = list(df["question"].values)
df["question_emb"] = self.retriever.embed_queries(texts=questions)
df = df.rename(columns={"question": "content"})
# # Convert Dataframe to list of dicts and index them in our DocumentStore
docs_to_index = df.to_dict(orient="records")
self.document_store.write_documents(docs_to_index)
self.document_store.update_embeddings(self.retriever)
self.document_store.save("testfile_path")
ss = FAISSDocumentStore.load(index_path="testfile_path")
# print(docs_to_index)
return docs_to_index, ss
#property
def reader(self):
reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2")
return reader
#property
def pipeline(self):
pipe = ExtractiveQAPipeline(self.reader, self.retriever)
return pipe
def predict(self, query):
prediction = self.pipeline.run(
query=query,
params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}},
)
return prediction
if __name__ == "__main__":
n = NeuralSearch()
q = "What is a novel coronavirus?"
print(n.predict(q))
my problem is that the FAISSDocumentStore always returns empty predictions as follows:
{'answers': [], 'documents': [], 'root_node': 'Query', 'params': {'Retriever': {'top_k': 10}, 'Reader': {'top_k': 5}}, 'query': 'What is a novel coronavirus?', 'node_id': 'Reader'}
How to fix it? and are there any tutorials on how to use FAISSDocumentStore?

Perhaps the problem is that you do not load the base itself here:
ss = FAISSDocumentStore.load(index_path="testfile_path")
Try adding a path to the configuration, it has a path to the base:
document_store = FAISSDocumentStore.load(index_path="file.faiss", config_path="file.json")

How to simplify python code in for loop or another

I have the following development which I'm working with the ElementTree and Pandas module in Python:
import xml.etree.ElementTree as ET
import pandas as pd
file_xml = ET.parse('example1.xml')
rootXML = file_xml.getroot()
def transfor_data_atri(rootXML):
file_xml = ET.parse(rootXML)
data_XML = [
{"Name": signal.attrib["Name"],
# "Value": signal.attrib["Value"]
"Value": int(signal.attrib["Value"].split(' ')[0])
} for signal in file_xml.findall(".//Signal")
]
signals_df = pd.DataFrame(data_XML)
extract_name_value(signals_df)
def extract_name_value(signals_df):
#print(signals_df)
signal_ig_st = signals_df[signals_df.Name.isin(["Status"])]
row_values_ig_st = signal_ig_st.T
vector_ig_st = row_values_ig_st.iloc[[1]]
signal_nav_DSP_rq = signals_df[signals_df.Name.isin(["SetDSP"])]
row_values_nav_DSP_rq = signal_nav_DSP_rq.T
vector_nav_DSP_rq = row_values_nav_DSP_rq.iloc[[1]]
signal_HMI_st = signals_df[signals_df.Name.isin(["HMI"])]
row_values_HMI_st = signal_HMI_st.T
vector_HMI_st = row_values_HMI_st.iloc[[1]]
signal_delay_ac = signals_df[signals_df.Name.isin(["Delay"])]
row_values_delay_ac = signal_delay_ac.T
vector_delay_ac = row_values_delay_ac.iloc[[1]]
signal_AutoConfigO_Rear = signals_df[signals_df.Name.isin(["AutoConfigO_Rear"])]
row_values_AutoConfigO_Rear = signal_AutoConfigO_Rear.T
vector_AutoConfigO_Rear = row_values_AutoConfigO_Rear.iloc[[1]]
signal_ACO_Front = signals_df[signals_df.Name.isin(["AutoConfigO_Front"])]
row_values_ACO_Front = signal_ACO_Front.T
vertor_ACO_Front = row_values_ACO_Front.iloc[[1]]
signal_ACO_Drvr = signals_df[signals_df.Name.isin(["AutoConfigO_Drvr"])]
row_values_ACO_Drvr = signal_ACO_Drvr.T
vector_ACO_Drvr = row_values_ACO_Drvr.iloc[[1]]
signal_ACO_Allst = signals_df[signals_df.Name.isin(["AutoConfigO_Allst"])]
row_values_ACO_Allst = signal_ACO_Allst.T
vector_ACO_Allst = row_values_ACO_Allst.iloc[[1]]
signal_RURRq_st = signals_df[signals_df.Name.isin(["RUResReqstStat"])]
row_values_RURRq_st = signal_RURRq_st.T
vector_RURRq_st = row_values_RURRq_st.iloc[[1]]
signal_RURqSy_st = signals_df[signals_df.Name.isin(["RUReqstrSystem"])]
row_values_RURqSy_st = signal_RURqSy_st.T
vector_RURqSy_st = row_values_RURqSy_st.iloc[[1]]
signal_RUAudS_st = signals_df[signals_df.Name.isin(["RUSource"])]
row_values_RUAudS_st = signal_RUAudS_st.T
vector_RUAudS_st = row_values_RUAudS_st.iloc[[1]]
signal_DSP_st = signals_df[signals_df.Name.isin(["DSP"])]
row_values_DSP = signal_DSP.T
vector_DSP = row_values_DSP.iloc[[1]]
print('1: ', vector_ig_st)
print('2: ', vector_nav_DSP_rq)
print('3: ', vector_HMI_st)
print('4: ', vector_delay_ac)
The output of the above is the following, they are the first 4 prints and it is fine, because it is what they want, but I have to simplify the code, so that any type of xml file of the type example.xml, can be read not only example1.xml:
The simplified code is required to bring the data as it is in the names_list variable, but not to use this variable, which is actually hard-coded:
names_list = [
'Status', 'SetDSP', 'HMI', 'Delay', 'AutoConfigO_Rear',
'AutoConfigO_Front', 'AutoConfigO_Drvr','AutoConfigO_Allst',
'RUResReqstStat', 'RUReqstrSystem', 'RUSource', 'DSP'
]
So when the client wants to put another XML file with the same structure, but with other names that are not in the code, it can read them without problem. Beforehand thank you very much.

I hope I'm understanding the questions correctly. my understanding is that
you want to dynamically produce the extract_name_value() function, and make it not as bulky in your code.
Im sorry, but I failed to comprehend the for i in signal_name: print(i) part of the question. perhaps you can rephrase the question, and help me understand?
my solution to the extract_name_value() part would be using the exec() function.
it is a built-in solution for dynamic execution.
name_list = ['Status', 'SetDSP', 'HMI', 'Delay', 'AutoConfigO_Rear',
'AutoConfigO_Front', 'AutoConfigO_Drvr', 'AutoConfigO_Allst',
'RUResReqstStat', 'RUReqstrSystem', 'RUSource', 'DSP']
def _build_extract_name_value_func(name_list):
extract_name_value_func = ""
for name in name_list:
holder_func = f"""
signal_{name} = signals_df[signals_df.Name.isin([{name}])]
row_values_{name} = signal_{name}.T
vector_{name} = row_values_{name}.iloc[[1]]
vector_list.append(vector_{name})
"""
extract_name_value_func += holder_func
return extract_name_value_func
def extract_name_value(name_list):
extract_name_value_func = build_extract_name_value_func(name_list)
exec(extract_name_value_func)
the code was not tested with actual data, because I am not familiar with handling xml structures. But I hope the python part can be some help to you.

I was able to solve it, I used a for loop and iterated the dataframe itself:
for i in signals_df.Name:
signal = signals_df [signals_df.Name.isin ([i])]
row_values = signal.T
vector = row_values.iloc [[1]]
print (vector)

Why am I getting this NameError?

Here's the error:
File "/Users/KarenLee/Desktop/temp/worldmodel.py", line 76, in update_on_time
obj = VeinAction(entity, image_store)
NameError: global name 'VeinAction' is not defined
And here is my code (this is in the file "actions.py"):
import entities
import worldmodel
import pygame
import math
import random
import point
import image_store
BLOB_RATE_SCALE = 4
BLOB_ANIMATION_RATE_SCALE = 50
BLOB_ANIMATION_MIN = 1
BLOB_ANIMATION_MAX = 3
FREEZE_ANIMATION_RATE = 100
FREEZE_STEPS = 4
ORE_CORRUPT_MIN = 20000
ORE_CORRUPT_MAX = 30000
QUAKE_STEPS = 10
QUAKE_DURATION = 1100
QUAKE_ANIMATION_RATE = 100
VEIN_SPAWN_DELAY = 500
VEIN_RATE_MIN = 8000
VEIN_RATE_MAX = 17000
WYVERN_RATE_MIN = 200
WYVERN_RATE_MAX = 600
WYVERN_ANIMATION_RATE = 100
class VeinAction:
def __init__(self, entity, image_store):
self.entity = entity
self.image_store = image_store
def vein_action(self, world, action, ticks):
entity = self.entity
open_pt = find_open_around(world, entities.get_position(entity),
entities.get_resource_distance(entity))
if open_pt:
ore = create_ore(world,
"ore - " + entities.get_name(entity) + " - " + str(ticks),
open_pt, ticks, action.image_store)
worldmodel.add_entity(world, ore)
tiles = [open_pt]
else:
tiles = []
schedule_action(world, entity, VeinAction(entity, action.image_store),
ticks + entities.get_rate(entity))
return tiles
def vein_take_action(self, world, action, ticks):
entities.remove_pending_action(self.entity, action)
if isinstance(action, VeinAction):
return self.vein_action(world, action, ticks)
And this is in the file "worldmodel.py":
import entities
import pygame
import ordered_list
import actions
import occ_grid
import point
class WorldModel:
def __init__(self, num_rows, num_cols, background):
self.background = occ_grid.Grid(num_cols, num_rows, background)
self.num_rows = num_rows
self.num_cols = num_cols
self.occupancy = occ_grid.Grid(num_cols, num_rows, None)
self.entities = []
self.action_queue = ordered_list.OrderedList()
def update_on_time(world, ticks):
tiles = []
next = world.action_queue.head()
obj = VeinAction(entity, image_store)
while next and next.ord < ticks:
world.action_queue.pop()
tiles.extend(obj.vein_take_action(world, next.item, ticks))
tiles.extend(actions.take_action(world, next.item, ticks))
next = world.action_queue.head()
return tiles
The error message comes from the update_on_time function in "worldmodel.py". I thought that this was how you would call a method from a class in a different file in a function, but it doesn't work! What is the correct way to do this? Or, is it possible to do this? Thanks in advance.

You imported the module actions which contains the class VeinAction. However, Python does not know this. You need to tell Python where VeinAction is located by adding actions. before it:
obj = actions.VeinAction(entity, image_store)
That, or you could import VeinAction directly:
from actions import VeinAction
Either way, you need to make sure that Python can find the class VeinAction.

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

Instantiating a class for text analytics - python

Related

Blank Strings Are Returned in Python Dataframe

Cant access function in OOP python

`FAISSDocumentStore` in `haystack` always returns empty results

How to simplify python code in for loop or another

Why am I getting this NameError?

Categories

Resources