I took some rows from a csv file like this:
pd.DataFrame(CV_data.take(5), columns=CV_data.columns)
and performed some functions on them. Now I want to save the result to csv again, but it gives the error module 'pandas' has no attribute 'to_csv'.
I am trying to save it like this:
pd.to_csv(CV_data, sep='\t', encoding='utf-8')
Here is my full code. How can I save my resulting data to csv or excel?
# Disable warnings, set Matplotlib inline plotting and load Pandas package
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
import pandas as pd
pd.options.display.mpl_style = 'default'
CV_data = sqlContext.read.load('Downloads/data/churn-bigml-80.csv',
                               format='com.databricks.spark.csv',
                               header='true',
                               inferSchema='true')
final_test_data = sqlContext.read.load('Downloads/data/churn-bigml-20.csv',
                                       format='com.databricks.spark.csv',
                                       header='true',
                                       inferSchema='true')
CV_data.cache()
CV_data.printSchema()
pd.DataFrame(CV_data.take(5), columns=CV_data.columns)
from pyspark.sql.types import DoubleType
from pyspark.sql.functions import UserDefinedFunction
binary_map = {'Yes':1.0, 'No':0.0, True:1.0, False:0.0}
toNum = UserDefinedFunction(lambda k: binary_map[k], DoubleType())
CV_data = CV_data.drop('State').drop('Area code') \
    .drop('Total day charge').drop('Total eve charge') \
    .drop('Total night charge').drop('Total intl charge') \
    .withColumn('Churn', toNum(CV_data['Churn'])) \
    .withColumn('International plan', toNum(CV_data['International plan'])) \
    .withColumn('Voice mail plan', toNum(CV_data['Voice mail plan'])).cache()
final_test_data = final_test_data.drop('State').drop('Area code') \
    .drop('Total day charge').drop('Total eve charge') \
    .drop('Total night charge').drop('Total intl charge') \
    .withColumn('Churn', toNum(final_test_data['Churn'])) \
    .withColumn('International plan', toNum(final_test_data['International plan'])) \
    .withColumn('Voice mail plan', toNum(final_test_data['Voice mail plan'])).cache()
pd.DataFrame(CV_data.take(5), columns=CV_data.columns)
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import DecisionTree
def labelData(data):
    # label: row[end], features: row[0:end-1]
    return data.map(lambda row: LabeledPoint(row[-1], row[:-1]))
training_data, testing_data = labelData(CV_data).randomSplit([0.8, 0.2])
model = DecisionTree.trainClassifier(training_data, numClasses=2, maxDepth=2,
                                     categoricalFeaturesInfo={1: 2, 2: 2},
                                     impurity='gini', maxBins=32)
print (model.toDebugString())
print ('Feature 12:', CV_data.columns[12])
print ('Feature 4: ', CV_data.columns[4] )
from pyspark.mllib.evaluation import MulticlassMetrics
def getPredictionsLabels(model, test_data):
    predictions = model.predict(test_data.map(lambda r: r.features))
    return predictions.zip(test_data.map(lambda r: r.label))
def printMetrics(predictions_and_labels):
    metrics = MulticlassMetrics(predictions_and_labels)
    print('Precision of True ', metrics.precision(1))
    print('Precision of False', metrics.precision(0))
    print('Recall of True ', metrics.recall(1))
    print('Recall of False ', metrics.recall(0))
    print('F-1 Score ', metrics.fMeasure())
    print('Confusion Matrix\n', metrics.confusionMatrix().toArray())
predictions_and_labels = getPredictionsLabels(model, testing_data)
printMetrics(predictions_and_labels)
CV_data.groupby('Churn').count().toPandas()
stratified_CV_data = CV_data.sampleBy('Churn', fractions={0: 388./2278, 1: 1.0}).cache()
stratified_CV_data.groupby('Churn').count().toPandas()
pd.to_csv(CV_data, sep='\t', encoding='utf-8')
to_csv is a method of a DataFrame object, not of the pandas module.
df = pd.DataFrame(CV_data.take(5), columns=CV_data.columns)
# whatever manipulations on df
df.to_csv(...)
You also have the line pd.DataFrame(CV_data.take(5), columns=CV_data.columns) in your code.
This line creates a DataFrame and then discards it. Even if you were successfully calling to_csv, none of your later changes to CV_data would be reflected in that DataFrame (and therefore in the output csv file).
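If the goal is to save the transformed Spark DataFrame itself (not just the first five rows), one option is to convert it to pandas first. A minimal sketch, assuming CV_data is small enough to fit in driver memory:
# Collects all rows to the driver and writes them out;
# only safe for data that fits in memory.
CV_data.toPandas().to_csv('CV_data.csv', sep='\t', encoding='utf-8', index=False)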
Solution:
You should write df.to_csv instead of pd.to_csv.
Justification:
to_csv is a method of a DataFrame object (df), while pd is the pandas module itself.
Hence your code was not working and threw this error:
AttributeError: module 'pandas' has no attribute 'to_csv'
This will do the job!
#Create a DataFrame:
new_df = pd.DataFrame({'id': [1,2,3,4,5], 'LETTERS': ['A','B','C','D','E'], 'letters': ['a','b','c','d','e']})
#Save it as csv in your folder:
new_df.to_csv('C:\\Users\\You\\Desktop\\new_df.csv')
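If you don't want the row index written as an extra first column, pass index=False as well:
new_df.to_csv('C:\\Users\\You\\Desktop\\new_df.csv', index=False)  # omit the index column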
Related
I am new to pandas and streamlit. What I am trying to do is filter a dataframe using a streamlit selectbox, but unfortunately, when I change the filter value, it is not reflected in the shown table: the name chosen in the filter does not update the table.
Here is the code I have used:
import xlrd
import pandas as pd
import os
from datetime import datetime
import streamlit as st
# import plotly_express as px
# to refer to the file
# change the current directory
currentDir = os.chdir('C:\\Users\\user\\Desktop\\lists');
files=os.listdir(currentDir)
columns=['Name','status','memorize-from-surah','memorize-from-ayah','memorize-to-surah','memorize-to-ayah','memorization-grade','words-meaning-grade','revision-from-surah','revision-from-ayah','revision-to-surah','revision-to-ayah','revision-grade']
folderDF=pd.DataFrame()
for file in files:
    # get the file name without extension for the sheikh name
    sheikh = os.path.splitext(file)[0]
    sheetDF = pd.DataFrame()
    workbook = pd.ExcelFile(f'C:\\users\\user\\Desktop\\lists\\{file}')
    sheets_numbers = len(workbook.sheet_names)
    print(sheets_numbers)
    for i in range(1, sheets_numbers-1):
        # print(workbook.sheet_by_index(i).name)
        current_sheet = pd.read_excel(file, sheet_name=i, header=None, index_col=1)
        date = current_sheet.iloc[6, 10]
        # for j in range(7, current_sheet.nrows):
        #     if current_sheet.cell(j,3).value=="غاب" or current_sheet.cell(j,3).value=="عذر":
        #         for k in range(4, current_sheet.ncols):
        #             current_sheet.cell(j,k).value=""
        sheets = pd.read_excel(file, sheet_name=i, skiprows=11, header=None, index_col=1)
        # df = pd.DataFrame(sheets.iloc[:,1:], index=index)
        # remove the first col
        df = pd.DataFrame(sheets.iloc[:, 1:])
        # remove empty rows
        df = df[df.iloc[:, 0].notna()]
        # rename the columns
        df.columns = columns
        # get the nrows
        nrows = len(df.index)
        sheikhCol = pd.Series(nrows*sheikh)
        dateCol = pd.Series(nrows*date)
        halkaCol = pd.Series(nrows*i)
        # df.insert(1,"sheikh",sheikhCol)
        df.insert(1, "halka", halkaCol)
        df.insert(2, "sheikh", sheikhCol)
        df.insert(3, "date", dateCol)
        df["sheikh"] = sheikh
        df['date'] = date
        df['halka'] = i
        if i == 1:
            sheetDF = pd.DataFrame(df)
            datatoexcel = pd.ExcelWriter('C:\\users\\user\\Desktop\\dataOut.xlsx')
            sheetDF.to_excel(datatoexcel)
            datatoexcel.save()
        else:
            sheetDF = pd.concat([sheetDF, df], axis=0)
    folderDF = pd.concat([folderDF, sheetDF], axis=0)
datatoexcel = pd.ExcelWriter('C:\\users\\user\\Desktop\\dataOut.xlsx')
folderDF.to_excel(datatoexcel)
datatoexcel.save()
#
# setting up the streamlit page
st.set_page_config(page_title='makraa reports',layout='wide')
# make filtration
#
st.sidebar.header("make filtration criteria")
nameFilter= folderDF['Name'].unique()
halkaFilter= folderDF['halka'].unique()
sheikhFilter= folderDF['sheikh'].unique()
student_choice= st.sidebar.selectbox("select the student Name",nameFilter)
halka_choice= st.sidebar.selectbox("select the halka Number",halkaFilter)
sheikh_choice= st.sidebar.selectbox("select the sheikh Number",sheikhFilter)
# student_choice2= st.sidebar.multiselect("select the student Name",options=nameFilter,default=nameFilter)
# filteredDf=folderDF[folderDF["Name"]== student_choice]
filteredDf = folderDF[(folderDF["Name"] == student_choice) & (folderDF["halka"] == halka_choice)]
# filteredDf=folderDF.query('Name==#student_choice')
st.write(filteredDf)
Note: st.dataframe(filteredDf) does not make any difference.
The streamlit version I used is 0.75, since the most recent version gave me a StreamlitAPIException.
Could you give me a hand with this?
Here is a sample code with example data.
Code
import streamlit as st
import pandas as pd
data = {
    'Name': ['a', 'b', 'c'],
    'halka': [1, 2, 3]
}
st.set_page_config(page_title='makraa reports',layout='wide')
folderDF = pd.DataFrame(data)
# make filtration
#
st.sidebar.header("make filtration criteria")
nameFilter = folderDF['Name'].unique()
halkaFilter = folderDF['halka'].unique()
# sheikhFilter = folderDF['sheikh'].unique()
student_choice = st.sidebar.selectbox("select the student Name", nameFilter)
halka_choice = st.sidebar.selectbox("select the halka Number", halkaFilter)
# sheikh_choice= st.sidebar.selectbox("select the sheikh Number",sheikhFilter)
# student_choice2= st.sidebar.multiselect("select the student Name",options=nameFilter,default=nameFilter)
filteredDf = folderDF[(folderDF["Name"] == student_choice) & (folderDF["halka"] == halka_choice)]
# filteredDf = filteredDf[filteredDf["halka"] == halkaFilter]
st.write(filteredDf)
Output
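One thing worth checking in the original code: since the table is filtered on both Name and halka at once, picking a combination that never occurs in the data yields an empty table, which can look as if the filter is not updating. A hedged refinement (using the column names from the question) is to narrow the halka options to the selected student:
# Only offer halka values that exist for the chosen student,
# so the combined Name + halka filter cannot select an empty subset.
halkaFilter = folderDF.loc[folderDF['Name'] == student_choice, 'halka'].unique()
halka_choice = st.sidebar.selectbox("select the halka Number", halkaFilter)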
My current DataFrame is:
# required libraries
import pandas as pd
dict_noticia = {'nome_adm': ['CC Brasil',
'ABC Futuro Esporte',
'Tabuao'],
'noticia': ["['folha', 'paulo', 'https', 'east', 'amazonaws', 'multclipp', 'arquivos', 'noticias', 'pdf', 'jpg', 'mônica', 'bergamo', 'longo', 'tempo']",
"['coluna', 'estadão']",
"['flamengo', 'futebol','melhor','campeao','é']"]
}
df = pd.DataFrame(dict_noticia)
df
I need a new column with the lemmas of the "noticia" (news) column.
The script below gives an error:
import stanza
nlp_stanza = stanza.Pipeline(lang='pt', processors='tokenize,mwt,pos,lemma')
def f_lematizacao_stanza(df, column_name, new_column_name):
    df[new_column_name] = df[column_name].apply(lambda x: ([w.lemma_ for w in nlp_stanza(row)]))
    return df
f_lematizacao_stanza(data,'noticia','noticia_lema')
NameError: name 'row' is not defined
How can I solve this?
Thank you in advance.
You have not defined the variable row. You need to use x:
def f_lematizacao_stanza(df, column_name, new_column_name):
    df[new_column_name] = df[column_name].apply(lambda x: ([w.lemma_ for w in nlp_stanza(x)]))
    return df
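Note that this only fixes the NameError. stanza's API also differs from spaCy's: the pipeline returns a Document whose words expose .lemma (no trailing underscore), so the comprehension itself will likely need adjusting too. A sketch, assuming the nlp_stanza pipeline defined above:
def f_lematizacao_stanza(df, column_name, new_column_name):
    # iterate the stanza Document sentence by sentence; each word carries .lemma
    df[new_column_name] = df[column_name].apply(
        lambda x: [word.lemma
                   for sentence in nlp_stanza(x).sentences
                   for word in sentence.words])
    return df

df = f_lematizacao_stanza(df, 'noticia', 'noticia_lema')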
For some reason my Python script works on macOS and on Raspbian Buster (yes, I tried it on a Raspberry Pi in a moment of desperation), but it is not working on Ubuntu 18, the OS I use on my main PC. I have even tried a fresh install of Ubuntu MATE 20 on another PC, but it is still not working.
This is the script:
import sys
import csv
from http.client import IncompleteRead
import pandas as pd
from Bio import Entrez
Entrez.email = ""
# get from WPs accession, corresponding assembly, NC IDs, strains names. Write a csv table with all these as final data table,
#+ a table with WPs and Assembly IDs for inputting in FLAG
list_of_accession = []
with open(sys.argv[1], 'r') as csvfile:
    efetchin = csv.reader(csvfile, delimiter=',')
    for row in efetchin:
        list_of_accession.append(str(row[0]))
with open('efetch_output.txt', mode='w') as efetch_output:
    efetch_output = csv.writer(efetch_output, delimiter='\t', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    efetch_output.writerow(['ID', 'Source', 'Nucleotide Accession', 'Start', 'Stop', 'Strand', 'Protein', 'Protein Name', 'Organism', ' Strain', 'Assembly'])
input_handle = Entrez.efetch(db="protein", id=list_of_accession, rettype="ipg", retmode="tsv")
for line in input_handle:
    print(line, file=open('efetch_output.txt', 'a'))
input_handle.close()
#process file in pandas
file_name = "efetch_output.txt"
file_name_output = "final_output.tsv"
df = pd.read_csv(file_name, sep="\t", low_memory=False)
# Get names of indexes for which rows have to be dropped
indexNames = df[ df['Source'] == 'INSDC'].index
# Delete these row indexes from dataFrame
df.drop(indexNames , inplace=True)
#rearrange table columns
df = df[['ID', 'Source', 'Nucleotide Accession', 'Protein', 'Protein Name', 'Start', 'Stop', 'Strand', 'Organism',' Strain', 'Assembly']]
#Sort table on Assembly number ignoring GCF_
df['sort'] = df['Assembly'].str.extract('(\d+)', expand=False).astype(str)
df.sort_values('sort',inplace=True, ascending=True)
df = df.drop('sort', axis=1)
#drop all duplicates that're similar in indicated subset fields
df3=df.drop_duplicates(subset=['Start', 'Stop', 'Strand', 'Organism',' Strain', 'Assembly'],keep='first')
#sorts dataframe alphabetically by Organism and writes to csv
df3.sort_values(by = "Organism", axis=0, ascending=True, inplace=False).to_csv("final_parsed_output.tsv", "\t", index=False)
#get WP_X and GFC_X IDs in a tsv to input in FLAGs
new_dataframe1 = df3[['Assembly', 'Protein']]
new_dataframe2 = df3[['Organism',' Strain', 'Assembly', 'Protein']]
new_dataframe1.sort_values(by = "Protein", axis=0, ascending=True, inplace=False).to_csv('flags_input.tsv', '\t', header=False, columns = ['Assembly', 'Protein'])
new_dataframe2.sort_values(by = "Organism", axis=0, ascending=True, inplace=False).to_csv('flags_input_wstrains.tsv', '\t', header=False, columns = ['Organism',' Strain', 'Assembly', 'Protein'])
print ('program finished')
I do not know if I can upload an example csv here for you to use, but it is basically a list of proteins in a csv like this:
WP_047566605.1 WP_043586512.1 WP_086526429.1 WP_043669791.1
WP_086513259.1 WP_086518190.1 WP_053774664.1 WP_012298127.1
WP_063071144.1 WP_012038522.1 WP_066595335.1 WP_088456184.1
WP_058743206.1 WP_042537210.1 WP_058724426.1
The error that I got in ubuntu mate 20 is:
jj#p4:~/Documents/Bioinformatica/Bioinformatic/August/Codes/Etna$ python3 etna.py JJTEST.csv
/usr/local/lib/python3.8/dist-packages/pandas/core/computation/expressions.py:68: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
return op(a, b)
Traceback (most recent call last):
File "etna.py", line 44, in <module>
df['sort'] = df['Assembly'].str.extract('(\d+)', expand=False).astype(str)
File "/usr/local/lib/python3.8/dist-packages/pandas/core/generic.py", line 5126, in __getattr__
return object.__getattribute__(self, name)
File "/usr/local/lib/python3.8/dist-packages/pandas/core/accessor.py", line 187, in __get__
accessor_obj = self._accessor(obj)
File "/usr/local/lib/python3.8/dist-packages/pandas/core/strings.py", line 2100, in __init__
self._inferred_dtype = self._validate(data)
File "/usr/local/lib/python3.8/dist-packages/pandas/core/strings.py", line 2157, in _validate
raise AttributeError("Can only use .str accessor with string values!")
AttributeError: Can only use .str accessor with string values!
I do not fully understand what the problem was, but I modified the output files from txt to csv and changed the tsv str to float, and now it is working.
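For anyone who hits the same AttributeError: it usually means the column holds non-string values (for example NaN from blank lines), which makes the .str accessor refuse to work. A hedged workaround, not tested against the original data, is to cast the column to string first:
# Casting to str first avoids "Can only use .str accessor with string values!"
# when the Assembly column contains NaN or mixed types.
df['sort'] = df['Assembly'].astype(str).str.extract(r'(\d+)', expand=False)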
temp = Window.partitionBy("id").orderBy("time").rowsBetween(-5, 5)
spark_df.withColumn("movingAvg",fn.avgspark_df("average")).over(temp)).show()
I'm getting this error on the last line:
dataframe object is not callable
You are missing a bracket, and some of the syntax also seems wrong. I assume this is what your code was before the bracket went missing:
fn.avg(spark_df("average"))
This is why you get the error: you are trying to call the DataFrame as a function. I believe you can achieve what you want with:
import pandas as pd
import pyspark.sql.functions as fn
from pyspark.sql import Window

df = pd.DataFrame({'id': [0,0,0,0,0,1,1,1,1,1],
                   'time': [1,2,3,4,5,1,2,3,4,5],
                   'average': [0,1,2,3,4,5,6,7,8,9]})
df = sqlContext.createDataFrame(df)
temp = Window.partitionBy("id").orderBy("time").rowsBetween(-1, 1)
df.withColumn("movingAvg", fn.avg("average").over(temp)).show()
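Here rowsBetween(-1, 1) makes the window cover the previous, current, and next row within each id partition, which is what produces a centred moving average.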
from pyspark.sql import SparkSession
from pyspark.sql import Window
from pyspark.sql.functions import max,min,avg
spark = SparkSession.builder.appName("Data Frame Example") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()
l = [("Alice", "2016-05-01", 50.00),
     ("Alice", "2016-05-03", 45.00),
     ("Alice", "2016-05-04", 55.00),
     ("Bob", "2016-05-01", 25.00),
     ("Bob", "2016-05-04", 29.00),
     ("Bob", "2016-05-06", 27.00)]
customers = spark.sparkContext.parallelize(l).toDF(["name", "date", "amountSpent"])
temp = Window.partitionBy("name").orderBy("date")
customers.withColumn( "movingAvg",avg("amountSpent").over(temp)).show()
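Note that this second window specifies orderBy("date") but no explicit rowsBetween, so Spark applies its default frame (from the start of the partition up to the current row), and the result is a running average per name rather than a fixed-width moving average.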
Import modules:
import Quandl
import pandas as pd
from pandas.tools.plotting import df_unique
Read the API key:
api_key = open('quandlapikey.txt','r').read()
Currently the function reads a csv file to get the codes; however, I plan to change this to sqlite.
def stock_list():
    #stocks = pd.read_csv('TID.csv'.rstrip())
    stocks = open('TID.csv').readlines()
    return stocks[0:]
Get stock values from Quandl; this works a treat.
def getStockValues():
    stocks = stock_list()
    main_df = pd.DataFrame()
    for abbrv in stocks:
        query = "LSE/" + str(abbrv).strip()
        df = Quandl.get(query, authtoken=api_key, start_date='2016-04-05', end_date='2016-04-10')
        df = df['Price']
        df.columns = [abbrv]
        print(query)
        print(df)
This statement causes the issue: for some reason, whilst looping, it cannot join additional stock prices.
        # This statement prints as expected
        print(df.tail(5))
        # causes error
        if main_df.empty:
            main_df = df
        else:
            main_df = main_df.join(df)
    # exit
    print('Task done!')

getStockValues()
This is the output from the print statements and error from the join.
Result:
LSE/VOD
Date
2016-04-14 226.80
2016-04-15 229.75
<ETC for all stocks>
Traceback (most recent call last):
File "H:\Workarea\DataB\SkyDriveP\OneDrive\PyProjects\Learning\21 myPprojects\stockPrices.py", line 49, in <module>
getStockValues()
File "H:\Workarea\DataB\SkyDriveP\OneDrive\PyProjects\Learning\21 myPprojects\stockPrices.py", line 43, in getStockValues
main_df = main_df.join(df)
File "H:\APPS\Python35-32\lib\site-packages\pandas\core\generic.py", line 2669, in __getattr__
return object.__getattribute__(self, name)
AttributeError: 'Series' object has no attribute 'join'
Further tests show that the issue seems to be with the scope of the pandas DataFrame object. This causes an issue:
main_df = pd.DataFrame()
for abbrv in stocks:
    query = "LSE/" + str(abbrv).strip()
    df = Quandl.get(query, authtoken=api_key, start_date='2016-03-05', end_date='2016-04-10')
    df = df['Price']
    df.columns = [abbrv]
    # causes error
    if main_df.empty:
        main_df = df
    else:
        main_df = main_df.join(df)
However, this does not cause an error, but it only returns one dataset:
for abbrv in stocks:
    main_df = pd.DataFrame()
    query = "LSE/" + str(abbrv).strip()
    df = Quandl.get(query, authtoken=api_key, start_date='2016-03-05', end_date='2016-04-10')
    df = df['Price']
    df.columns = [abbrv]
    if main_df.empty:
        main_df = df
    else:
        main_df = main_df.join(df)
It seems to me that the issue with your code is somewhere around here:
...
df = df['Price']     ## <- you are turning the DataFrame into a Series here
df.columns = [abbrv] ## <- no effect whatsoever on a Series
print(query)
print(df)
What I would do instead is simply add the new column to your existing DataFrame.
## if main_df.empty:              ## <- remove this line
##     main_df = df               ## <- change this to the line below
main_df[abbrv] = df               ## this just adds the new column to your df, using the Series as data
## else:                          ## <- remove this line
##     main_df = main_df.join(df) ## <- remove this line
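Putting that together, the whole loop might look like this — a sketch reusing the names from the original code (Quandl.get, api_key, and the 'Price' column are assumed as above):
def getStockValues():
    stocks = stock_list()
    main_df = pd.DataFrame()
    for abbrv in stocks:
        code = str(abbrv).strip()   # readlines() keeps trailing newlines
        query = "LSE/" + code
        df = Quandl.get(query, authtoken=api_key,
                        start_date='2016-04-05', end_date='2016-04-10')
        main_df[code] = df['Price'] # each ticker's price Series becomes one column
    print(main_df.tail(5))
    print('Task done!')

getStockValues()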