Filtering a pandas data frame - python

Suppose we have a pandas data frame df with an id column and about 5 rows. In the code below, why do I still get the length of the filtered data frame to be 5?
import pickle
import gzip
import bz2
import sys
import pandas as pd
import os
import _pickle as cPickle
from downcast import reduce

def load(filename):
    """
    Load from filename using pickle
    @param filename: name of file to load from
    @type filename: str
    """
    try:
        f = bz2.BZ2File(filename, 'rb')
    except OSError as err:
        sys.stderr.write('File ' + filename + ' cannot be read\n')
        sys.stderr.write(str(err) + '\n')
        return
    myobj = cPickle.load(f)
    f.close()
    return myobj

df = pd.DataFrame({"ids": [1, 2, 3, 4, 5]})
print(df.shape)
sfile = bz2.BZ2File('df_list_small', 'w')
pickle.dump(df, sfile)
sfile.close()  # close so the compressed stream is fully flushed before reading it back
This gives a shape of (5, 1).
df_new = load('df_list_small')
df_new = reduce(df_new)
all_groups = {ident: group for ident, group in df_new.groupby('ids')}
ids = 1
df_test = all_groups[ids]
print(df_test.shape)
This gives a shape of (1, 1).
So maybe it works only for certain files?

I figured it out. The filtered data frame had the same dimensions as the original one because all of its rows carried the same id, so the two frames were equal. If I had filtered on a different id, the dimensions of the filtered data frame would have been different.
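For reference, a minimal sketch of that situation on a toy frame (using the same ids column name as above): when every row carries the same id, a plain boolean mask and the groupby lookup both return all five rows, so the "filtered" frame matches the original.

import pandas as pd

df = pd.DataFrame({"ids": [1, 1, 1, 1, 1]})           # every row has the same id
by_mask = df[df["ids"] == 1]                           # boolean-mask filtering
by_group = {k: g for k, g in df.groupby("ids")}[1]     # groupby-based filtering

print(by_mask.shape)   # (5, 1) -- identical to the original frame
print(by_group.shape)  # (5, 1)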

Related

A DataFrame object does not have an attribute select

In Palantir Foundry, I am trying to read all XML files from a dataset and then parse them in a for loop.
Up to the second-to-last line, the code runs without errors.
from transforms.api import transform, Input, Output
from transforms.verbs.dataframes import sanitize_schema_for_parquet
from bs4 import BeautifulSoup
import pandas as pd
import lxml

@transform(
    output=Output("/Spring/xx/datasets/mydataset2"),
    source_df=Input("ri.foundry.main.dataset.123"),
)
def read_xml(ctx, source_df, output):
    df = pd.DataFrame()
    filesystem = source_df.filesystem()
    hadoop_path = filesystem.hadoop_path
    files = [f"{hadoop_path}/{f.path}" for f in filesystem.ls()]
    for i in files:
        with open(i, 'r') as f:
            file = f.read()
        soup = BeautifulSoup(file, 'xml')
        data = []
        for e in soup.select('offer'):
            data.append({
                'meldezeitraum': e.find_previous('data').get('meldezeitraum'),
                'id': e.get('id'),
                'parent_id': e.get('parent_id'),
            })
        df = df.append(data)
    output.write_dataframe(sanitize_schema_for_parquet(df))
However, as soon as I add the last line:
output.write_dataframe(sanitize_schema_for_parquet(df))
I get this error:
Missing transform attribute
A DataFrame object does not have an attribute select. Please check the spelling and/or the datatype of the object.
/transforms-python/src/myproject/datasets/mydataset.py
output.write_dataframe(sanitize_schema_for_parquet(df))
What am I doing wrong?
You have to convert your pandas DataFrame to a Spark DataFrame. Even though they share the same name, these are two different object types in Python.
The easiest way to do that is:
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()
df_spark = spark.createDataFrame(df)
You can then pass df_spark to the output.write_dataframe() function.
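Applied to the transform above, only the very end changes. Here is a minimal standalone sketch of the conversion step, with a hypothetical stand-in frame in place of the df built in the parsing loop; the Foundry-specific write is left as a comment:

from pyspark.sql import SparkSession
import pandas as pd

# hypothetical pandas result standing in for the df built in the parsing loop
df = pd.DataFrame({"meldezeitraum": ["2023-01"], "id": ["1"], "parent_id": ["0"]})

spark = SparkSession.builder.getOrCreate()
df_spark = spark.createDataFrame(df)  # pandas DataFrame -> Spark DataFrame
# inside the transform this would then be:
# output.write_dataframe(sanitize_schema_for_parquet(df_spark))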

Loading multiple txt files into list, single column and do not want it delimited

This code works as-is: it pulls from each TXT/XML file I have and inserts the contents into Snowflake. However, when using loadtxt or genfromtxt, brackets [ ] are added and the values in each file get split apart. I do not want to separate anything; I want each XML/TXT file loaded as-is into a single row of the table. Is there something else I can use to stage the data before it makes it into the dataframe, or is there a better process for getting from a txt file to a Snowflake table?
I have 3 files named test.xml, test1.xml, and test2.xml.
Their contents are the words seen below on a single line, separated only by spaces. Long term these will be full XML files, each inserted into a single row of the table.
import glob
import os
import numpy as np
import snowflake.connector
import pandas as pd
from datetime import datetime
from snowflake.connector.pandas_tools import write_pandas

os.environ["REQUESTS_CA_BUNDLE"] = r'C:\Certificates\cacert.pem'
os.environ["SSL_CERT_FILE"] = r'C:\Certificates\cacert.pem'

ctx = snowflake.connector.connect(
    user='email',
    password='pass',
    account='server',
    warehouse='SB',
    database='SB',
    schema='SB'
)

dated = datetime.today().strftime('%Y-%m-%d')
source_dir = r'C:\Users\jSmith\.spyder-py3\SampleXML'
table_name = 'LV_XML'

file_list = glob.glob(source_dir + '/*.XML')

data = []
for file_path in file_list:
    data.append(np.loadtxt(file_path, dtype='str'))

df = pd.DataFrame(list(zip(data)), columns=['SRC_XML'])
df["TPR_AS_OF_DT"] = dated

success, nchunks, nrows, _ = write_pandas(ctx, df, table_name, database='SB', schema='SB', quote_identifiers=False)
print(str(success) + ', ' + str(nchunks) + ', ' + str(nrows))
ctx.close()
Currently this is what the list looks like, and what it ends up as after inserting into Snowflake (screenshots not reproduced here).
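One way to keep each file intact is to read the raw text with open().read() instead of np.loadtxt, since that never tokenizes the contents. A minimal sketch, assuming the same folder as above and that the write_pandas call stays unchanged:

import glob
from datetime import datetime
import pandas as pd

source_dir = r'C:\Users\jSmith\.spyder-py3\SampleXML'   # same folder as above
file_list = glob.glob(source_dir + '/*.XML')

rows = []
for file_path in file_list:
    with open(file_path, 'r', encoding='utf-8') as f:
        rows.append(f.read())          # whole file as one untouched string

df = pd.DataFrame({'SRC_XML': rows})   # one row per file, nothing split, no brackets
df["TPR_AS_OF_DT"] = datetime.today().strftime('%Y-%m-%d')

df can then be handed to write_pandas exactly as in the original script.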

Compressing files from a Python Dataframe

I have created a dataframe using pandas in Python. The dataframe takes two columns from a .csv file, a file path and a filename, joins them, and outputs the full path.
I am trying to use this output to zip the file in question, but it isn't working properly and just overwrites the file.
import pandas as pd
import zipfile
import os
from os import path
from os.path import basename
column_names = ["Path", "Filename", r"Path"]
df = pd.read_csv(r"resources.csv", usecols= ["Path","Filename"])
df = df.dropna()
df = ["/".join(i) for i in zip(df["Path"].map(str),df["Filename"].map(str))]
rows = list(df)
for row in rows:
    print(row)
I originally had the zipfile.ZipFile calls inside the for row in rows block, but replaced them with print(row) here to produce the list. Can anybody help point me in the right direction?
import pandas as pd
import zipfile
import os
from os import path
from os.path import basename

column_names = ["Path", "Filename", r"Path"]
df = pd.read_csv(r"resources.csv", usecols=["Path", "Filename"])
df["fullpath"] = df[["Path", "Filename"]].agg("/".join, axis=1)
df["zipfilename"] = df["Filename"].str.replace(".py", "", regex=False)

rows = list(df.values)
for row in rows:
    zip_file = zipfile.ZipFile(row[3] + '.zip', 'w', zipfile.ZIP_DEFLATED)
    zip_file.write(row[2], basename(row[1]))
    zip_file.close()
    print(row)
After some head scratching I managed to get exactly what I needed from the dataframe and zip the individual files.
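A slightly tidier variant of the same loop, as a sketch against the same (hypothetical) resources.csv layout: a with block closes each archive even if the write fails, and itertuples lets you refer to the columns by name instead of positional indexes.

import zipfile
from os.path import basename
import pandas as pd

df = pd.read_csv("resources.csv", usecols=["Path", "Filename"]).dropna()
df["fullpath"] = df["Path"].astype(str) + "/" + df["Filename"].astype(str)
df["zipfilename"] = df["Filename"].str.replace(".py", "", regex=False)

for row in df.itertuples(index=False):
    # one archive per source file; the with block closes it automatically
    with zipfile.ZipFile(row.zipfilename + ".zip", "w", zipfile.ZIP_DEFLATED) as zf:
        zf.write(row.fullpath, basename(row.Filename))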

How to convert a log file to a pandas DF

I have managed to read the following log file into python:
import os
import glob
import pandas as pd

folder = r'C:\Users\x\x\x\x\\'
for infile in glob.glob(os.path.join(folder, 'console*')):
    file = open(infile, 'r').read()
    print(file)
print(file) gives me:
John, 1,7,8, text
Matt, 3,7,10, text2
Natasha, 4,60,3,text3
I am hoping to convert this into a pandas df:
df = pd.DataFrame(file)
but getting a ValueError: DataFrame constructor not properly called!
Does anyone know how to construct the DataFrame of 3 rows by 5 columns and then add in my own column headers? Thanks very much!
import os
import glob
import pandas as pd

folder = 'C:\\'
filename2 = [y for y in glob.glob(f'{folder}\\*.*')]

# In the case of .csv files.
df_cc = pd.DataFrame()
for z in filename2:
    df = pd.read_csv(z, header=None)
    df_cc = df_cc.append(df)
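For the specific log lines shown in the question, a minimal sketch that yields a 3-row, 5-column frame with your own headers is to let read_csv split on the commas (the column names here are hypothetical placeholders):

import io
import pandas as pd

# the same three lines the question prints from the log file
log_text = """John, 1,7,8, text
Matt, 3,7,10, text2
Natasha, 4,60,3,text3"""

cols = ["name", "a", "b", "c", "comment"]        # hypothetical header names
df = pd.read_csv(io.StringIO(log_text), header=None, names=cols, skipinitialspace=True)
print(df.shape)  # (3, 5)

When reading the real file, pass infile to read_csv instead of the StringIO buffer.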

How to read a specific column of csv file using python

I am new to scikit-learn and I want to convert a collection of data which I have already labelled into a dataset. I have converted the .csv file of the data into a NumPy array; however, one problem I have run into is splitting the data into training sets based on the presence of a flag in the second column. I want to know how to access a particular row or column of a .csv file using pandas. The following is my code:
import numpy as np
import pandas as pd
import csv
import nltk
import pickle
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from nltk.classify import ClassifierI
from statistics import mode

def numpyfy(fileid):
    data = pd.read_csv(fileid, encoding='latin1')
    #pd.readline(data)
    target = data["String"]
    data1 = data.ix[1:, :-1]
    #print(data)
    return data1

def learn(fileid):
    trainingsetpos = []
    trainingsetneg = []
    datanew = numpyfy(fileid)
    if(datanew.ix['Status'] == 1):
        trainingsetpos.append(datanew.ix['String'])
    if(datanew.ix['Status'] == 0):
        trainingsetneg.append(datanew.ix['String'])
    print(list(trainingsetpos))
You can use boolean indexing to split the data. Something like
import pandas as pd

def numpyfy(fileid):
    df = pd.read_csv(fileid, encoding='latin1')
    target = df.pop('String')
    data = df.iloc[1:, :-1]  # .ix has been removed from recent pandas; .iloc slices by position
    return target, data

def learn(fileid):
    target, data = numpyfy(fileid)
    trainingsetpos = data[data['Status'] == 1]
    trainingsetneg = data[data['Status'] == 0]
    print(trainingsetpos)
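A quick way to sanity-check the boolean indexing, as a sketch with a hypothetical two-column file standing in for fileid (String plus the Status flag):

import io
import pandas as pd

# hypothetical CSV contents; 'Status' is the flag in the second column
csv_text = """String,Status
good text,1
bad text,0
more good text,1"""

df = pd.read_csv(io.StringIO(csv_text))
trainingsetpos = df[df['Status'] == 1]    # rows whose flag is 1
trainingsetneg = df[df['Status'] == 0]    # rows whose flag is 0
print(trainingsetpos['String'].tolist())  # ['good text', 'more good text']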
