How to use lemmatization with the stanza library with dataframe in python? - python

My current dataframe is:
# bibliotecas necessárias
import pandas as pd
dict_noticia = {'nome_adm': ['CC Brasil',
'ABC Futuro Esporte',
'Tabuao'],
'noticia': ["['folha', 'paulo', 'https', 'east', 'amazonaws', 'multclipp', 'arquivos', 'noticias', 'pdf', 'jpg', 'mônica', 'bergamo', 'longo', 'tempo']",
"['coluna', 'estadão']",
"['flamengo', 'futebol','melhor','campeao','é']"]
}
df = pd.DataFrame(dict_noticia)
df
I need a new column with the lemmas of the "noticia" column.
The script below gives error:
import stanza
nlp_stanza = stanza.Pipeline(lang='pt', processors='tokenize,mwt,pos,lemma')
def f_lematizacao_stanza(df,column_name,new_column_name):
    # Build `new_column_name` by lemmatizing every cell of `column_name`.
    # BUG (the NameError quoted below): the lambda's parameter is `x`, but the
    # body passes `row` to nlp_stanza -- `row` is never defined anywhere.
    # NOTE(review): `w.lemma_` is spaCy's attribute name; stanza words expose
    # `.lemma` (no trailing underscore) -- confirm against the stanza docs.
    df[new_column_name] = df[column_name].apply(lambda x: ([w.lemma_ for w in nlp_stanza(row)]))
    # Returns the same DataFrame, mutated in place.
    return df
f_lematizacao_stanza(data,'noticia','noticia_lema')
NameError: name 'row' is not defined
How can I solve this?
Thank you in advance.

You have not defined the variable row. You need to use x:
def f_lematizacao_stanza(df, column_name, new_column_name):
    """Add `new_column_name` containing the lemmas of each text in `column_name`.

    Parameters:
        df: pandas DataFrame to mutate.
        column_name: name of the column holding the raw text.
        new_column_name: name of the column that will receive the lemma lists.

    Returns the same DataFrame, mutated in place.
    """
    def _lemmas(text):
        # Run the (module-level) stanza pipeline on one cell of text.
        doc = nlp_stanza(text)
        # stanza groups tokens per sentence; each word carries a plain
        # `.lemma` attribute (`.lemma_` is spaCy's API, not stanza's, and a
        # stanza Document is not iterated directly for words).
        return [word.lemma for sent in doc.sentences for word in sent.words]

    df[new_column_name] = df[column_name].apply(_lemmas)
    return df

Related

Python groupby output

I'm trying to take a spreadsheet input and display only the urls that are CMS related (wordpress|Wordpress|WordPress|Drupal|drupal|Joomla|joomla). I'm trying to get the output to be "technologies" and then the "url" associated (grouped) to those urls.
Link to data file
output
Code is:
import pandas as pd
import numpy as np
dataset="ASD-example.xlsx"
term_cms = 'wordpress|Wordpress|WordPress|Drupal|drupal|Joomla|joomla'
df = pd.read_excel((dataset), sheet_name="HTTPX")
df['technology_count'] = df.groupby('technologies')['url'].transform('count')
df.drop(['timestamp', 'request', 'response-header', 'scheme', 'port', 'body-sha256','header-sha256', 'a', 'cnames', 'input', 'location', 'error', 'response-body', 'content-type','method', 'host', 'content-length', 'chain-status-codes', 'status-code', 'tls-grab', 'csp', 'vhost','websocket', 'pipeline', 'http2', 'cdn', 'response-time', 'chain', 'final-url', 'failed','favicon-mmh3', 'lines', 'words','path','webserver'],inplace=True,axis=1)
df[df['technologies'].str.contains(term_cms, na=False)]
pivot1 = pd.pivot_table(df, index=['technologies', 'url'], columns=None, fill_value=0)
print(pivot1)
I cleaned your code a bit to make it more readable to get the output you want.
term_cms = ["wordpress", "drupal", "joomla"]
# remove square brackets and lowercase all names
df['technologies'] = df['technologies'].str.strip('[]')
df['technologies'] = df['technologies'].str.lower()
# include only needed technologies
mask = df['technologies'].isin(term_cms)
df = df[mask]
# groupby and count
df = df.groupby(['technologies', 'url']).size().reset_index(name='technology_count')
Output:
technologies URL technology_count
0 joomla https://testcom123. 1
1 wordpress https://test.com:443 1

The streamlit does not refresh the dataframe on the localhost

I am new to pandas and streamlit. What I am trying to do is filter such a dataframe using a streamlit selectbox,
but unfortunately everything is going well except that when changing the filter value it does not reflect on the shown table
as you could see the name in the filter does not update the table
here is the code I have used:
import xlrd
import pandas as pd
import os
from datetime import datetime
import streamlit as st
# import plotly_express as px
# to refer to the file
# change the current directory
currentDir = os.chdir('C:\\Users\\user\\Desktop\\lists');
files=os.listdir(currentDir)
columns=['Name','status','memorize-from-surah','memorize-from-ayah','memorize-to-surah','memorize-to-ayah','memorization-grade','words-meaning-grade','revision-from-surah','revision-from-ayah','revision-to-surah','revision-to-ayah','revision-grade']
folderDF=pd.DataFrame()
for file in files:
# get the file name without extension for the sheikh name
sheikh=os.path.splitext(file)[0]
sheetDF=pd.DataFrame()
workbook = pd.ExcelFile(f'C:\\users\\user\\Desktop\\lists\\{file}')
sheets_numbers = len(workbook.sheet_names)
print(sheets_numbers)
for i in range(1, sheets_numbers-1):
# print(workbook.sheet_by_index(i).name)
current_sheet = pd.read_excel(file,sheet_name=i,header=None,index_col=1)
date= current_sheet.iloc[6, 10]
# for j in range(7,current_sheet.nrows):
# if current_sheet.cell(j,3).value=="غاب" or current_sheet.cell(j,3).value=="عذر":
# for k in range(4,current_sheet.ncols):
# current_sheet.cell(j,k).value=""
sheets=pd.read_excel(file,sheet_name=i,skiprows=11,header=None,index_col=1)
# df = pd.DataFrame(sheets.iloc[:,1:], index=index)
#remove the first col
df=pd.DataFrame(sheets.iloc[:,1:])
#remove empty rows
df=df[df.iloc[:,0].notna()]
#rename the columns
df.columns = columns
#get the nrows
nrows= len(df.index)
sheikhCol=pd.Series(nrows*sheikh)
dateCol=pd.Series(nrows*date)
halkaCol=pd.Series(nrows*i)
# df.insert(1,"sheikh",sheikhCol)
df.insert(1,"halka",halkaCol)
df.insert(2,"sheikh",sheikhCol)
df.insert(3,"date",dateCol)
df["sheikh"]=sheikh
df['date']=date
df['halka']=i
if i == 1:
sheetDF=pd.DataFrame(df)
datatoexcel = pd.ExcelWriter('C:\\users\\user\\Desktop\\dataOut.xlsx')
sheetDF.to_excel(datatoexcel)
datatoexcel.save()
else:
sheetDF = pd.concat([sheetDF, df], axis=0)
folderDF=pd.concat([folderDF,sheetDF],axis=0)
datatoexcel=pd.ExcelWriter('C:\\users\\user\\Desktop\\dataOut.xlsx')
folderDF.to_excel(datatoexcel)
datatoexcel.save()
#
# setting up the streamlit page
st.set_page_config(page_title='makraa reports',layout='wide')
# make filteration
#
st.sidebar.header("make filtration criteria")
nameFilter= folderDF['Name'].unique()
halkaFilter= folderDF['halka'].unique()
sheikhFilter= folderDF['sheikh'].unique()
student_choice= st.sidebar.selectbox("select the student Name",nameFilter)
halka_choice= st.sidebar.selectbox("select the halka Number",halkaFilter)
sheikh_choice= st.sidebar.selectbox("select the sheikh Number",sheikhFilter)
# student_choice2= st.sidebar.multiselect("select the student Name",options=nameFilter,default=nameFilter)
# filteredDf=folderDF[folderDF["Name"]== student_choice]
filteredDf = folderDF[(folderDF["Name"] == student_choice) & (folderDF["halka"] == halka_choice)]
# filteredDf=folderDF.query('Name==#student_choice')
st.write(filteredDf)
note st.dataframe(filteredDf) does not make any difference
the streamlit version I used is 0.75 , since the recent version gave me the StreamlitAPIException like that enter link description here
could you give a hand in this
Here is a sample code with example data.
Code
import streamlit as st
import pandas as pd
data = {
'Name': ['a', 'b', 'c'],
'halka': [1, 2, 3]
}
st.set_page_config(page_title='makraa reports',layout='wide')
folderDF = pd.DataFrame(data)
# make filteration
#
st.sidebar.header("make filtration criteria")
nameFilter = folderDF['Name'].unique()
halkaFilter = folderDF['halka'].unique()
# sheikhFilter = folderDF['sheikh'].unique()
student_choice = st.sidebar.selectbox("select the student Name", nameFilter)
halka_choice = st.sidebar.selectbox("select the halka Number", halkaFilter)
# sheikh_choice= st.sidebar.selectbox("select the sheikh Number",sheikhFilter)
# student_choice2= st.sidebar.multiselect("select the student Name",options=nameFilter,default=nameFilter)
filteredDf = folderDF[(folderDF["Name"] == student_choice) & (folderDF["halka"] == halka_choice)]
# filteredDf = filteredDf[filteredDf["halka"] == halkaFilter]
st.write(filteredDf)
Output

Search pandas by index 'INT'

I'm trying to create a search but I'm facing an error, according to some tests I can search for 'name', but I would like to search for 'number_order', does anyone have a solution? Remembering that 'number_order' cannot be changed inside the dataframe EX: 'number_order' : [202204000001] -> 'number_order' : ['202204000001']
import pandas as pd
import matplotlib.pyplot as plt
d = {'number_order' : [202204000001, 202204000002, 202204000003, 202204000004,
202204000005, 202204000006],
'client' : ['Roger Nascimento', 'Rodrigo Peixato', 'Pedro',
'Rafael', 'Maria', 'Emerson'],
'value' : ['120', '187.74', '188.7', '300', '563.2', '198.0']
}
df = pd.DataFrame(data = d)
src_field_data = '202004'
filtered_data = df['number_order']
filtered_data = df.loc[filtered_data.str.contains(f'^{src_field_data}', case = False)]
print(f'number_order FILTERED {filtered_data}\n')
I want to search like this example below, using only a part of the text:
import pandas as pd
import matplotlib.pyplot as plt
d = {'number_order' : [202204000001, 202204000002, 202204000003, 202204000004,
202204000005, 202204000006],
'client' : ['Roger Nascimento', 'Rodrigo Peixato', 'Pedro',
'Rafael', 'Maria', 'Emerson'],
'value' : ['120', '187.74', '188.7', '300', '563.2', '198.0']
}
df = pd.DataFrame(data = d)
src_field_data = 'R'
filtered_data = df['client']
filtered_data = df.loc[filtered_data.str.contains(f'^{src_field_data}', case = False)]
print(f'number_order FILTERED {filtered_data}\n')
Convert values to strings:
filtered_data = df.loc[filtered_data.astype(str).str.contains(f'^{src_field_data}', case = False)]

How to get a list of the name of every open window and place that into dataframe?

So I'm trying to use both win32gui and Pandas to get a dataframe (df) of windows that are open. Below is what I wrote. I end up with an error. How can I get one dataframe returned?
# info http://timgolden.me.uk/pywin32-docs/win32gui__EnumWindows_meth.html
import win32gui
import pandas as pd
def winEnumHandler( hwnd, dfx ):
    # EnumWindows callback: collect an (ID, Window title) row for every
    # visible window that has a non-empty title.
    # NOTE(review): `dfx` is the extra value EnumWindows passes through;
    # rebinding it below only changes the local name, so the caller's
    # DataFrame never receives the appended rows.
    if win32gui.IsWindowVisible( hwnd ) and len(win32gui.GetWindowText( hwnd ))>0 :
        idv = hex(hwnd)
        winv = win32gui.GetWindowText(hwnd)
        df = pd.DataFrame({'ID' : idv , 'Window': winv}, index = ['0'])
        frames = [dfx, df]
        dfx = pd.concat(frames)
        # print(dfx)
    # An EnumWindows handler must return an int/bool (truthy to continue the
    # enumeration); returning a DataFrame is what raises the
    # "TypeError: an integer is required (got type DataFrame)" in the traceback.
    return dfx # Comment out this and it runs but not the result I want.
dfx= pd.DataFrame() # empty dataframe
win32gui.EnumWindows( winEnumHandler, dfx )
print(dfx)
Traceback
Traceback (most recent call last):
File "c:\Users\s...\Python\List of windows.py", line 19, in <module>
win32gui.EnumWindows( winEnumHandler, dfx )
TypeError: an integer is required (got type DataFrame)
So the key to getting the dataframe out of the function is to use a global variable. This variable must be declared as global inside the function so there is no confusion and Python does not treat it as a local variable. Here is the code.
import win32gui
import pandas as pd
dfx = pd.DataFrame()
i = 0
def winEnumHandler(hwnd, x):
    """EnumWindows callback: append visible, titled windows to the global dfx.

    `x` is the pass-through value from EnumWindows and is ignored; results are
    accumulated in the module-level `dfx`, counting rows with the global `i`.
    """
    global dfx, i
    if not win32gui.IsWindowVisible(hwnd):
        return
    title = win32gui.GetWindowText(hwnd)
    if len(title) > 0:
        row = pd.DataFrame({'ID': hex(hwnd), 'Window': title}, index=[i])
        dfx = pd.concat([dfx, row])
        i += 1
win32gui.EnumWindows( winEnumHandler, i )
print(dfx)

AttributeError: module 'pandas' has no attribute 'to_csv'

I took some rows from csv file like this
pd.DataFrame(CV_data.take(5), columns=CV_data.columns)
and performed some functions on it. now i want to save it in csv again but it is giving error module 'pandas' has no attribute 'to_csv'
I am trying to save it like this
pd.to_csv(CV_data, sep='\t', encoding='utf-8')
here is my full code. how can i save my resulting data in csv or excel?
# Disable warnings, set Matplotlib inline plotting and load Pandas package
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
import pandas as pd
pd.options.display.mpl_style = 'default'
CV_data = sqlContext.read.load('Downloads/data/churn-bigml-80.csv',
format='com.databricks.spark.csv',
header='true',
inferSchema='true')
final_test_data = sqlContext.read.load('Downloads/data/churn-bigml-20.csv',
format='com.databricks.spark.csv',
header='true',
inferSchema='true')
CV_data.cache()
CV_data.printSchema()
pd.DataFrame(CV_data.take(5), columns=CV_data.columns)
from pyspark.sql.types import DoubleType
from pyspark.sql.functions import UserDefinedFunction
binary_map = {'Yes':1.0, 'No':0.0, True:1.0, False:0.0}
toNum = UserDefinedFunction(lambda k: binary_map[k], DoubleType())
CV_data = CV_data.drop('State').drop('Area code') \
.drop('Total day charge').drop('Total eve charge') \
.drop('Total night charge').drop('Total intl charge') \
.withColumn('Churn', toNum(CV_data['Churn'])) \
.withColumn('International plan', toNum(CV_data['International plan'])) \
.withColumn('Voice mail plan', toNum(CV_data['Voice mail plan'])).cache()
final_test_data = final_test_data.drop('State').drop('Area code') \
.drop('Total day charge').drop('Total eve charge') \
.drop('Total night charge').drop('Total intl charge') \
.withColumn('Churn', toNum(final_test_data['Churn'])) \
.withColumn('International plan', toNum(final_test_data['International plan'])) \
.withColumn('Voice mail plan', toNum(final_test_data['Voice mail plan'])).cache()
pd.DataFrame(CV_data.take(5), columns=CV_data.columns)
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import DecisionTree
def labelData(data):
    """Map each RDD row to a LabeledPoint: last field is the label, the rest are features."""
    def to_labeled_point(row):
        label, features = row[-1], row[:-1]
        return LabeledPoint(label, features)
    return data.map(to_labeled_point)
training_data, testing_data = labelData(CV_data).randomSplit([0.8, 0.2])
model = DecisionTree.trainClassifier(training_data, numClasses=2, maxDepth=2,
categoricalFeaturesInfo={1:2, 2:2},
impurity='gini', maxBins=32)
print (model.toDebugString())
print ('Feature 12:', CV_data.columns[12])
print ('Feature 4: ', CV_data.columns[4] )
from pyspark.mllib.evaluation import MulticlassMetrics
def getPredictionsLabels(model, test_data):
    """Return an RDD of (prediction, true label) pairs for `test_data`."""
    feature_rdd = test_data.map(lambda r: r.features)
    label_rdd = test_data.map(lambda r: r.label)
    predictions = model.predict(feature_rdd)
    return predictions.zip(label_rdd)
def printMetrics(predictions_and_labels):
    """Print precision/recall/F1 and the confusion matrix for an RDD of (prediction, label) pairs."""
    metrics = MulticlassMetrics(predictions_and_labels)
    report = [
        ('Precision of True ', metrics.precision(1)),
        ('Precision of False', metrics.precision(0)),
        ('Recall of True ', metrics.recall(1)),
        ('Recall of False ', metrics.recall(0)),
        ('F-1 Score ', metrics.fMeasure()),
        ('Confusion Matrix\n', metrics.confusionMatrix().toArray()),
    ]
    for caption, value in report:
        print(caption, value)
predictions_and_labels = getPredictionsLabels(model, testing_data)
printMetrics(predictions_and_labels)
CV_data.groupby('Churn').count().toPandas()
stratified_CV_data = CV_data.sampleBy('Churn', fractions={0: 388./2278, 1: 1.0}).cache()
stratified_CV_data.groupby('Churn').count().toPandas()
pd.to_csv(CV_data, sep='\t', encoding='utf-8')
to_csv is a method of a DataFrame object, not of the pandas module.
df = pd.DataFrame(CV_data.take(5), columns=CV_data.columns)
# whatever manipulations on df
df.to_csv(...)
You also have a line pd.DataFrame(CV_data.take(5), columns=CV_data.columns) in your code.
This line creates a dataframe and then discards it. Even if you were successfully calling to_csv, none of your changes to CV_data would have been reflected in that dataframe (and therefore in the outputed csv file).
Solution-
You should write df.to_csv instead of pd.to_csv
Justification-
to_csv is a method of a DataFrame object, while pd refers to the pandas module itself.
Hence, your code was not working and throwing this Error "
AttributeError: module 'pandas' has no attribute 'to_csv'"
This will do the job!
#Create a DataFrame:
new_df = pd.DataFrame({'id': [1,2,3,4,5], 'LETTERS': ['A','B','C','D','E'], 'letters': ['a','b','c','d','e']})
#Save it as csv in your folder:
new_df.to_csv('C:\\Users\\You\\Desktop\\new_df.csv')

Categories