Removing text from a field in MongoDB with Python

I have a CSV file that has a field/column containing a comma (","). I load this CSV into MongoDB for data manipulation. I would like to strip all text from the comma to the right, leaving only the text to the left of the comma.
What is the most efficient way to accomplish this? In my MongoDB CSV import script (I use pandas)? Or afterward, once the data is already in MongoDB? Honestly, I'm new to programming and would like to know how to do it in either scenario, but I would like to see which solution is most efficient.
Here's my CSV import script in Python:
#!/usr/bin/env python
import sys
import os
import pandas as pd
import pymongo
import json

def import_content(filepath):
    mng_client = pymongo.MongoClient('localhost', 27017)
    mng_db = mng_client['swx_inv']
    collection_name = 'device.switch'
    db_cm = mng_db[collection_name]
    cdir = os.path.dirname(__file__)
    file_res = os.path.join(cdir, filepath)
    data = pd.read_csv(file_res, skiprows=2, skip_footer=1)
    data_json = json.loads(data.to_json(orient='records'))
    db_cm.remove()
    db_cm.insert(data_json)

if __name__ == "__main__":
    filepath = '/vagrant/data/DeviceInventory-Category.Switch.csv'
    import_content(filepath)
Here are the header row and first three data rows of the CSV for reference. I'm trying to alter the last field, "OS Image":
Device,Serial Number,Realm,Vendor,Model,OS Image
ABBNWX0100,SMG3453ESDN,BlAH BLAH,Cisco,WS-C6509-E,"IOS 12.2(33)SXI9, s72033_rp-ADVENTERPRISEK9_WAN-M"
ABBNWX0101,SDG127343S0,BLAH BLAH,Cisco,WS-C4506-E,"IOS 12.2(53)SG8, cat4500-IPBASEK9-M"
ABBNWX0102,TREFDSFY1KK,BLAH BLAH,Cisco,WS-C3560V2-48PS-S,"IOS 12.2(55)SE5, C3560-IPBASEK9-M"
EDIT: I found a method to do what I needed via pandas, prior to uploading to the MongoDB collection. I have to do this twice, as the same column uses two different delimiters and a regex would not work properly:
# Use pandas to read CSV, skipping top 2 lines & footer line from
# CSV export. Set column data to string type.
data = pd.read_csv(
    file_res, index_col=False, skiprows=2,
    skip_footer=1, dtype={'Device': str, 'Serial Number': str,
                          'Realm': str, 'Vendor': str, 'Model': str,
                          'OS Image': str}
)
# Drop rows where Serial Number is empty
data = data.dropna(subset=['Serial Number'])
# Split the OS Image column by "," and ";" to remove extraneous data
data['OS Image'].update(data['OS Image'].apply(
    lambda x: x.split(",")[0] if len(x.split()) > 1 else None)
)
data['OS Image'].update(data['OS Image'].apply(
    lambda x: x.split(";")[0] if len(x.split()) > 1 else None)
)
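For completeness, the other scenario from the question (cleaning up after the data is already in MongoDB) can be done by iterating over the collection with pymongo and rewriting the field with $set. A minimal sketch against the collection from the import script above; the regex filter and the cleanup logic here are my own, not from the original post:

import pymongo

client = pymongo.MongoClient('localhost', 27017)
coll = client['swx_inv']['device.switch']

# Match documents whose "OS Image" still contains either delimiter,
# then keep only the text to the left of the first one.
for doc in coll.find({'OS Image': {'$regex': '[,;]'}}):
    cleaned = doc['OS Image'].split(',')[0].split(';')[0].strip()
    coll.update_one({'_id': doc['_id']}, {'$set': {'OS Image': cleaned}})

Doing the cleanup in pandas before the insert avoids these extra round trips to the database, so the pre-upload approach is generally the more efficient of the two.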

import csv
s='''Device,Serial Number,Realm,Vendor,Model,OS Image
ABBNWX0100,SMG3453ESDN,BlAH BLAH,Cisco,WS-C6509-E,"IOS 12.2(33)SXI9, s72033_rp-ADVENTERPRISEK9_WAN-M"
ABBNWX0101,SDG127343S0,BLAH BLAH,Cisco,WS-C4506-E,"IOS 12.2(53)SG8, cat4500-IPBASEK9-M"
ABBNWX0102,TREFDSFY1KK,BLAH BLAH,Cisco,WS-C3560V2-48PS-S,"IOS 12.2(55)SE5, C3560-IPBASEK9-M"'''
print("\n".join([','.join(row[:5])+","+str(row[5].split(",")[0]) for row in csv.reader(s.split("\n"))]))
Converting the list comprehension into loops for more readability:
newtext = ""
for row in csv.reader(s.split("\n")):
    newtext += ','.join(row[:5]) + "," + str(row[5].split(",")[0]) + "\n"
print(newtext)
Output:
Device,Serial Number,Realm,Vendor,Model,OS Image
ABBNWX0100,SMG3453ESDN,BlAH BLAH,Cisco,WS-C6509-E,IOS 12.2(33)SXI9
ABBNWX0101,SDG127343S0,BLAH BLAH,Cisco,WS-C4506-E,IOS 12.2(53)SG8
ABBNWX0102,TREFDSFY1KK,BLAH BLAH,Cisco,WS-C3560V2-48PS-S,IOS 12.2(55)SE5
https://ideone.com/FMJCrO
For a file you will have to use
with open(fname) as f:
    content = f.readlines()
content will contain a list of the lines in the file; you can then pass it to csv.reader(content).
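Put together, a file-based version might look like this sketch (fname and out_fname are placeholder paths, not from the original post):

import csv

# Read the file, keeping only the text before the first comma in the last field
with open(fname, newline='') as f:
    rows = [row[:5] + [row[5].split(",")[0]] for row in csv.reader(f)]

# Write the cleaned rows back out
with open(out_fname, 'w', newline='') as f:
    csv.writer(f).writerows(rows)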

Related

Extract Invalid Data From Dataframe to a File (.txt)

First time posting here and new to Python. My program should take a JSON file and convert it to CSV. I have to check each field for validity. For a record that does not have all valid fields, I need to output that record to a file. My question is: how would I take an invalid data entry and save it to a text file? Currently, the program can check for validity, but I do not know how to extract the data that is invalid.
import numpy as np
import pandas as pd
import logging
import re as regex
from validate_email import validate_email

# Variables for characters
passRegex = r"^(?!.*\s)(?=.*[A-Z])(?=.*[a-z])(?=.*\d).{8,50}$"
nameRegex = r"^[a-zA-Z0-9\s\-]{2,80}$"

# Read in json file to dataframe df variable
# Read in data as strings
df = pd.read_json('j2.json', dtype=str)

# Find nan values and replace them with a string
#df = df.replace(np.nan, 'Error.log', regex=True)

# Data validation check for columns
df['accountValid'] = df['account'].str.contains(nameRegex, regex=True)
df['userNameValid'] = df['userName'].str.contains(nameRegex, regex=True)
df['valid_email'] = df['email'].apply(lambda x: validate_email(x))
df['valid_number'] = df['phone'].apply(lambda x: len(str(x)) == 11)

# Prepend 86 to phone number column
df['phone'] = ('86' + df['phone'])

# Convert dataframe to csv file
df.to_csv('test.csv', index=False)
The json file I am using has thousands of rows
Thank you in advance!
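One way to get the invalid records out, building on the flag columns the snippet above already creates, is to combine them into a boolean mask and write the rows that fail it to a file. A minimal sketch (the column names come from the question; the output filename is just an example):

# All four checks must pass for a record to be considered valid
valid_mask = (df['accountValid'] & df['userNameValid']
              & df['valid_email'] & df['valid_number'])

# Rows failing any check go to a text file; the rest go to the CSV
df[~valid_mask].to_csv('invalid_records.txt', index=False, sep='\t')
df[valid_mask].to_csv('test.csv', index=False)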

Loading multiple txt files into a list, single column, and I do not want it delimited

This code works as-is: it pulls from each txt/XML file I have and inserts into Snowflake. However, when using loadtxt or genfromtxt, NumPy adds brackets [ ] and splits the values in the file. I do not want to separate anything; I want each XML/TXT file loaded as-is into a single row of the table. Is there something else I can use to stage the data before it makes it into the dataframe, or is there a better process to get from a txt file to a Snowflake table?
I have 3 files named: test.xml, test1.xml, test2.xml.
Their contents are the words seen below, in a single line separated only by spaces. Long term these will be full XML files, each inserted into a single row of the table.
import glob
import os
import numpy as np
import snowflake.connector
import pandas as pd
from datetime import datetime
from snowflake.connector.pandas_tools import write_pandas

os.environ["REQUESTS_CA_BUNDLE"] = r'C:\Certificates\cacert.pem'
os.environ["SSL_CERT_FILE"] = r'C:\Certificates\cacert.pem'

ctx = snowflake.connector.connect(
    user='email',
    password='pass',
    account='server',
    warehouse='SB',
    database='SB',
    schema='SB'
)

dated = datetime.today().strftime('%Y-%m-%d')
source_dir = r'C:\Users\jSmith\.spyder-py3\SampleXML'
table_name = 'LV_XML'

file_list = glob.glob(source_dir + '/*.XML')
data = []
for file_path in file_list:
    data.append(
        np.loadtxt(file_path, dtype='str'))

df = pd.DataFrame(list(zip(data)),
                  columns=['SRC_XML'])
df["TPR_AS_OF_DT"] = dated

success, nchunks, nrows, _ = write_pandas(ctx, df, table_name,
                                          database='SB', schema='SB',
                                          quote_identifiers=False)
print(str(success) + ', ' + str(nchunks) + ', ' + str(nrows))
ctx.close()
(The original post included screenshots of the resulting list and of the rows after insertion into Snowflake, showing the bracketed, split values.)
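A minimal sketch of one way around this: skip np.loadtxt entirely and read each file as a single string, so nothing gets tokenized or bracketed. This assumes the rest of the script above (file_list, dated, write_pandas) stays the same:

# Read each file verbatim as one string -> one row per file
data = []
for file_path in file_list:
    with open(file_path, 'r', encoding='utf-8') as f:
        data.append(f.read())

df = pd.DataFrame({'SRC_XML': data})
df["TPR_AS_OF_DT"] = dated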

In Pandas, how can I extract a certain value using the key from a dataframe imported from a csv file?

Using Pandas, I'm trying to extract value using the key but I keep failing to do so. Could you help me with this?
There's a csv file like below:
value
"{""id"":""1234"",""currency"":""USD""}"
"{""id"":""5678"",""currency"":""EUR""}"
I imported this file in Pandas and made a DataFrame out of it:
(screenshot of the resulting one-column dataframe)
However, when I try to extract a value using a key (e.g. df["id"]), I get an error message.
I'd like to see the value 1234 or 5678 via df["id"]. What steps should I take to get this done? This may be a very basic question, but I need your help. Thanks.
The csv file isn't being read in correctly.
You haven't set a delimiter; pandas can automatically detect a delimiter but hasn't done so in your case. See the read_csv documentation for more on this. Because of this, the pandas dataframe has a single column, value, whose cells are entire lines from your file - the first entry is "{""id"":""1234"",""currency"":""USD""}". So the file doesn't have a column id, and you can't select data by id.
The data aren't formatted as a pandas df, with row titles and columns of data. One option is to read the data in by manually processing each row, though there may be slicker options.
file = 'test.dat'
id_vals = []
currency = []
with open(file, 'r') as f:
    for line in f.readlines()[1:]:
        ## remove obfuscating characters
        for c in '"{}\n':
            line = line.replace(c, '')
        line = line.split(',')
        ## extract values to two lists
        id_vals.append(line[0][3:])
        currency.append(line[1][9:])
You just need to clean up the CSV file a little and you are good. Here is every step:
import re

# open your csv and read as a text string
with open('My_CSV.csv', 'r') as f:
    my_csv_text = f.read()

# remove problematic strings
find_str = ['{', '}', '"', 'id:', 'currency:', 'value']
replace_str = ''
for i in find_str:
    my_csv_text = re.sub(i, replace_str, my_csv_text)

# Create new csv file and save cleaned text
new_csv_path = './my_new_csv.csv'  # or whatever path and name you want
with open(new_csv_path, 'w') as f:
    f.write(my_csv_text)

# Create pandas dataframe
df = pd.read_csv('my_new_csv.csv', sep=',', names=['ID', 'Currency'])
print(df)
Output df:
ID Currency
0 1234 USD
1 5678 EUR
You need to parse each row of your dataframe using json.loads() (or eval(), though json.loads is safer).
Something like this:
import json

# itertuples() yields one namedtuple per row; .value is the raw JSON string
for row in df.itertuples():
    print(json.loads(row.value)["id"])
    # OR
    print(eval(row.value)["id"])
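Alternatively, the parsing can be done column-wise, letting pandas expand the parsed dicts into real columns. A sketch, assuming df has the single 'value' column described in the question (pd.json_normalize is available in pandas 1.0+):

import json
import pandas as pd

# Parse every JSON string, then expand the dicts into columns
parsed = pd.json_normalize(df['value'].apply(json.loads).tolist())
print(parsed['id'])        # 1234, 5678
print(parsed['currency'])  # USD, EUR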

Converting date and time format when importing csv file in Python

I haven't been able to find a solution in similar questions yet so I'll have to give it a go here.
I am importing a csv file looking like this in notepad:
",""ItemName"""
"Time,""Raw Values"""
"7/19/2019 10:31:29 PM,"" 0"","
"7/19/2019 10:32:01 PM,"" 1"","
What I want when I save it as a new csv is to reformat the date/time and the corresponding value like this (required by the analysis software). The semicolon as separator, and at the end of each line, is important, and I don't really need a header:
2019-07-19 22:31:29;0;
2019-07-19 22:32:01;1;
This is what it looks like in Python:
Item1 = pd.read_csv(r'.\Datafiles\ItemName.csv')
Item1
#Output:
#                        ,"ItemName"
# 0           Time,"Raw Values"
# 1   7/19/2019 10:31:29 PM," 0",
# 2   7/19/2019 10:32:01 PM," 1",
Item1.dtypes
# ,"ItemName"    object
# dtype: object
I have tried using datetime without any luck but there might be something fishy with the datatypes that I am not aware of.
What you want, in principle, is to read the file to a DataFrame, convert the datetime column, and export the df to csv again. I think you will need to get rid of the quote chars to get the import correct. You can do so by reading the file content to a string, replacing the '"', and feeding that string to pandas.read_csv. EX:
import os
from io import StringIO
import pandas as pd

# this is just to give an example:
s = '''",""ItemName"""
"Time,""Raw Values"""
"7/19/2019 10:31:29 PM,"" 0"","
"7/19/2019 10:32:01 PM,"" 1"","'''
f = StringIO(s)
# in your script, make f a file pointer instead, e.g.
# with open('path_to_input.csv', 'r') as f:

# now get rid of the "
csvcontent = ''
for row in f:
    csvcontent += row.replace('"', '')

# read to DataFrame
df = pd.read_csv(StringIO(csvcontent), sep=',', skiprows=1, index_col=False)
df['Time'] = pd.to_datetime(df['Time'])

# save cleaned output as ;-separated csv
# (note: in pandas >= 1.5 this keyword is spelled "lineterminator")
dst = 'path_where_to_save.csv'
df.to_csv(dst, index=False, sep=';', line_terminator=';' + os.linesep)

Read multiple txt files into Dict into Pandas dataframe

I am trying to load multiple txt files into a dataframe. I know how to load URLs, CSV, and Excel, but I couldn't find any reference on how to load multiple txt files into a dataframe and match them with a dictionary, or vice versa.
The text files are not comma- or tab-separated, just plain text containing song lyrics.
I checked the pandas documentation; any assistance is welcome.
https://pandas.pydata.org/pandas-docs/stable/reference/io.html
Ideally, the dataframe I hope to achieve would be like this example:
             | lyrics
-------------+-----------------------------------------------------------------------------------------
bonjovi      | some text from the text files HiHello! WelcomeThank you Thank you for coming.
-------------+-----------------------------------------------------------------------------------------
lukebryan    | some other text from the text files.Hi.Hello WelcomeThank you Thank you for coming.
-------------+-----------------------------------------------------------------------------------------
johnprine    | yet some text from the text files. Hi.Hello WelcomeThank you Thank you for coming.
Basic example
Folder structure: lyrics/
urls = [
    'lyrics/bonjovi.txt',
    'lyrics/lukebryan.txt',
    'lyrics/johnprine.txt',
    'lyrics/brunomars.txt',
    'lyrics/methodman.txt',
    'lyrics/bobmarley.txt',
    'lyrics/nickcannon.txt',
    'lyrics/weeknd.txt',
    'lyrics/dojacat.txt',
    'lyrics/ladygaga.txt',
    'lyrics/dualipa.txt',
    'lyrics/justinbieber.txt',
]
Musician names
bands = ['bonjovi', 'lukebryan', 'johnprine', 'brunomars', 'methodman', 'bobmarley', 'nickcannon', 'weeknd', 'dojacat', 'ladygaga', 'dualipa', 'justinbieber']
Open the text files
The files are in the directory lyrics/, from where I am running my Jupyter notebook.
import pickle

for i, c in enumerate(bands):
    with open("lyrics/" + c + ".txt", "wb") as file:
        pickle.dump(lyrics[i], file)
Double-check to make sure the data has been loaded properly:
data.keys()
Hopefully I get a result like this:
dict_keys(['bonjovi', 'lukebryan', 'johnprine', 'brunomars', 'methodman', 'bobmarley', 'nickcannon', 'weeknd', 'dojacat', 'ladygaga', 'dualipa', 'justinbieber'])
# We are going to change this to key: artist, value: string format
def combine_text(list_of_text):
    '''Takes a list of text and combines them into one large chunk of text.'''
    combined_text = ' '.join(list_of_text)
    return combined_text

# Combine it!
data_combined = {key: [combine_text(value)] for (key, value) in data.items()}
We can either keep it in dictionary format or put it into a pandas dataframe:
import pandas as pd
pd.set_option('max_colwidth', 150)

data_df = pd.DataFrame.from_dict(data_combined).transpose()
data_df.columns = ['lyrics']
data_df = data_df.sort_index()
data_df
import os
import re
import pandas as pd

# get the full path of each txt file
filePath = []
for file in os.listdir("./lyrics"):
    filePath.append(os.path.join("./lyrics", file))

# pull the file name from the path with regex, capturing the text before the .txt
# (this pattern assumes Windows path separators)
fileName = re.compile(r'\\(.*)\.txt')

# make an empty dict, data, with the file name as key and the words in the file as value
data = {}
for file in filePath:
    # capturing the file name
    key = fileName.search(file)
    with open(file, "r") as readFile:
        # note that key[1] is the capture group from our search, and that the text is put into a list
        data[key[1]] = [readFile.read()]

# make a dataframe from the dict, and rename the columns
df = pd.DataFrame(data).T.reset_index().rename(columns={'index': 'bands', 0: 'lyrics'})
This is how I would do it. Notice I generalized the file manipulation, so I don't have to worry about manually making the list for the keys, and everything matches up.
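A possible variation on the same idea, sketched with pathlib so the key extraction doesn't depend on Windows path separators (Path.stem gives the file name without the .txt extension; the lyrics/ directory is taken from the question):

from pathlib import Path
import pandas as pd

# one row per file: the stem (e.g. 'bonjovi') becomes the band, the contents become the lyrics
rows = [{'bands': p.stem, 'lyrics': p.read_text()}
        for p in sorted(Path('lyrics').glob('*.txt'))]
df = pd.DataFrame(rows)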
