I got "Pandas ValueError Arrays Must be All Same Length"
Before I start: I checked answers to similar problems, and people suggest using something like
DataFrame(dict([(k, Series(v)) for k, v in d.iteritems()]))  # Python 2; use d.items() on Python 3
if you have only two values in the dictionary, or:
a = {'Links': lines, 'Titles': titles, 'Singers': finalsingers, 'Albums': finalalbums, 'Years': years}
df = pd.DataFrame.from_dict(a, orient='index')
df.transpose()
But neither of them worked for me. My code walks the files in a directory, captures each file's name and last-modified time, opens the file, and passes it to a hash function called phash, which returns a value. I think the problem could be in the phash function; maybe it sometimes returns a null value.
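To see why the Series suggestion above avoids the error, here is a minimal illustration (my own toy data, not from the original question):

import pandas as pd

d = {'a': [1, 2, 3], 'b': [1, 2]}  # columns of unequal length
# pd.DataFrame(d) raises ValueError: arrays must all be same length
df = pd.DataFrame({k: pd.Series(v) for k, v in d.items()})  # pads the short column with NaN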
So in my case the data looks something like this:
import os
import pandas as pd
import imagehash
from PIL import Image

raw_data = {}
hash_11 = []
time_1 = []
file_name_1 = []
for file in date_file_list:
    try:
        y = file[1]
        file_name = os.path.basename(y)  # extract just the filename, or file_name = os.path.split(file[1])
        file_name = file_name.split('_-_')[0]
        file_name_1.append(file_name)
        # convert date tuple to MM/DD/YYYY HH:MM:SS format
        #file_date = time.strftime("%m/%d/%y %H:%M:%S", file[0])
        time = file[0]
        time_1.append(time)
        img = Image.open(str(file[1]))
        hash_1 = imagehash.dhash(img)
        hash_11.append(hash_1)
        data = {'hash_1': hash_11, 'time': time_1, 'file_name': file_name_1}
        raw_data.update(data)
    except:
        pass
df = pd.DataFrame(raw_data, columns=['hash_1', 'time', 'file_name'])
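For what it's worth, here is a sketch of one way to keep the three lists from drifting out of sync (assuming the same date_file_list input as above): collect one record per file, so a file that fails mid-way contributes nothing instead of a partial row.

records = []
for file in date_file_list:
    try:
        name = os.path.basename(file[1]).split('_-_')[0]
        img_hash = imagehash.dhash(Image.open(str(file[1])))
        # all three values are appended together, or not at all
        records.append({'hash_1': img_hash, 'time': file[0], 'file_name': name})
    except Exception:
        pass
df = pd.DataFrame(records, columns=['hash_1', 'time', 'file_name'])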
I use PyLucene 9.4.1 to index a document, and I just noticed a weird problem: there are some words, e.g. 'baby', that are present in the document, but PyLucene is unable to find them in the index.
This is my code to index the document:
(The document can be downloaded from here.)
filepath = os.getcwd() + '/' + 'wiki_movie_plots_deduped.csv'

def indexDocument(title, year, plot):
    ft = FieldType()
    ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS)
    ft.setStored(True)
    ft.setTokenized(True)
    ft.setStoreTermVectors(True)
    ft.setStoreTermVectorOffsets(True)
    ft.setStoreTermVectorPositions(True)
    doc = document.Document()
    doc.add(document.Field("Title", title, ft))
    doc.add(document.Field("Plot", plot, ft))
    writer.addDocument(doc)

def CloseWriter():
    writer.close()

def makeInvertedIndex(file_path):
    df = pd.read_csv(file_path)
    print(df.columns)
    docid = 0
    for i in df.index:
        print(docid, '-', df['Title'][i])
        indexDocument(df['Title'][i], df['Release Year'][i], df['Plot'][i])
        docid += 1

indexPath = File('index/').toPath()
indexDir = FSDirectory.open(indexPath)
writerConfig = IndexWriterConfig(EnglishAnalyzer())
writer = IndexWriter(indexDir, writerConfig)
inverted = makeInvertedIndex(filepath)
CloseWriter()
This is the code to search the created index for a keyword:
keyword = 'baby'
fieldname = 'Title'
result = list()
indexPath = File('index/').toPath()
directory = FSDirectory.open(indexPath)
analyzer = StandardAnalyzer()
reader = DirectoryReader.open(directory)
searcher = IndexSearcher(DirectoryReader.open(directory))
query = QueryParser(fieldname, analyzer).parse(keyword)
print('query', query)
numdocs = searcher.count(query)
print("#-docs:", numdocs)
searcher.setSimilarity(BM25Similarity(1.2, 0.75))
scoreDocs = searcher.search(query, 1000).scoreDocs # it returns TopDocs object containing scoreDocs and totalHits
# scoreDoc object contains docId and score
print('total hit:', searcher.search(query, 100).totalHits)
print("%s total matching documents" % (len(scoreDocs)))
Any help to understand the problem is appreciated.
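One thing worth checking (my observation, not something stated in the question): the index is written with EnglishAnalyzer, which stems 'baby' down to 'babi', while the search uses StandardAnalyzer, which does not stem, so the query term never matches the stored token. Using the same analyzer on both sides should make the terms line up, e.g. (assuming EnglishAnalyzer is imported as in the indexing code):

analyzer = EnglishAnalyzer()  # same analyzer as at index time
query = QueryParser(fieldname, analyzer).parse(keyword)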
The code is required to take addresses from a csv file and then use a function to compute the corresponding latitudes and longitudes. I get the correct latitudes and longitudes, but I am unable to save them to a new csv file.
import requests
import urllib.parse
import pandas as pd
# function to get the coordinates:
def lat_long(add):
    url = 'https://nominatim.openstreetmap.org/search/' + urllib.parse.quote(add) + '?format=json'
    response = requests.get(url).json()
    print(response[0]["lat"], response[0]["lon"])
    return

# the first 5 address values are read from the CSV file and passed to the function
df = pd.read_csv('C:\\Users\\Umer Abbas\\Desktop\\lat_long.csv')
print("Latitude", "", "Longitude")
for i in range(0, 5):
    add = df._get_value(i, 'Address')
    lat_long(add)
Output is:
Latitude Longitude
34.0096961 71.8990106
34.0123846 71.5787458
33.6038766 73.048136
33.6938118 73.0651511
24.8546842 67.0207055
I want to save this output into a new file, but I am unable to get that working.
Just a small modification might help:

def lat_long(add):
    url = 'https://nominatim.openstreetmap.org/search/' + urllib.parse.quote(add) + '?format=json'
    response = requests.get(url).json()
    print(response[0]["lat"], response[0]["lon"])
    Lat = response[0]["lat"]
    Long = response[0]["lon"]
    return Lat, Long

Lat_List = []
Long_List = []
df = pd.read_csv('C:\\Users\\Umer Abbas\\Desktop\\lat_long.csv')
print("Latitude", "", "Longitude")
for i in range(0, 5):
    add = df._get_value(i, 'Address')
    Lat, Long = lat_long(add)  # unpack both values from a single request
    Lat_List.append(Lat)
    Long_List.append(Long)

df1 = pd.DataFrame(columns=['Latitude', 'Longitude'])
df1['Latitude'] = Lat_List
df1['Longitude'] = Long_List
df1.to_csv("LatLong.csv")
# one line of change here
def lat_long(add):
    url = 'https://nominatim.openstreetmap.org/search/' + urllib.parse.quote(add) + '?format=json'
    response = requests.get(url).json()
    print(response[0]["lat"], response[0]["lon"])
    return response[0]["lat"], response[0]["lon"]  # return the lat and long

# three lines added here
df = pd.read_csv('C:\\Users\\Umer Abbas\\Desktop\\lat_long.csv')
l = []  # define empty list
print("Latitude", "", "Longitude")
for i in range(0, 5):
    add = df._get_value(i, 'Address')
    l.append(lat_long(add))  # append (lat, lon) tuples to the empty list

# create a dataframe and output as csv; lat_long returns (lat, lon), so Latitude comes first
pd.DataFrame(l, columns=['Latitude', 'Longitude']).to_csv('test.csv', sep=' ')
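As an aside (not part of either answer): Nominatim's usage policy expects clients to identify themselves, so if the requests start being rejected, it may help to send a custom User-Agent and let requests build the query string. A rough sketch, with a made-up User-Agent value:

def lat_long(add):
    url = 'https://nominatim.openstreetmap.org/search'
    params = {'q': add, 'format': 'json'}
    headers = {'User-Agent': 'my-geocoding-script'}  # hypothetical identifier; use your own
    response = requests.get(url, params=params, headers=headers).json()
    return response[0]['lat'], response[0]['lon']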
How do I change the name of each csv file? I am getting new data for each token. Also, can I give the csv files the real names of the tokens?
Example:
Token 492033 = kotakbank.csv, 738561 = reliance.csv, 341249 = hdfcbank.csv
i = [492033, 738561, 341249]  # token list
for _ in list(i):
    def get_data():
        """Fetch Data from Kite"""
        to_date = datetime.now(timezone('Asia/Calcutta'))
        from_date = to_date - timedelta(days=5)
        interval = '5minute'
        instrument_token = _
        data = kite.historical_data(instrument_token, from_date, to_date, interval, continuous=False, oi=False)
        data = pd.DataFrame(data)
        data.index = data.date
        data.drop(columns=['date'], inplace=True)
        df.to_csv('datafile1.csv')  # saving into csv file
        return data
    df = get_data()
To use the custom filenames, you could pass the token and filename to the function like this:
def get_data(instrument_token, filename):
    """Fetch Data from Kite"""
    to_date = datetime.now(timezone('Asia/Calcutta'))
    from_date = to_date - timedelta(days=5)
    interval = '5minute'
    data = kite.historical_data(instrument_token, from_date, to_date, interval, continuous=False, oi=False)
    data = pd.DataFrame(data)
    data.index = data.date
    data.drop(columns=['date'], inplace=True)
    data.to_csv(filename)  # saving into csv file
    return data
Call the function like this:
df = get_data(492033, "kotakbank.csv")
You could loop it like this:
token_list = [[492033, "kotakbank.csv"], [738561, "reliance.csv"], [341249, "hdfcbank.csv"]]
for i in token_list:
    df = get_data(i[0], i[1])
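A dict is arguably a more natural fit for the token-to-filename mapping; a small variation on the same idea:

token_map = {492033: "kotakbank.csv", 738561: "reliance.csv", 341249: "hdfcbank.csv"}
for token, filename in token_map.items():
    df = get_data(token, filename)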
I am using the code below to form a JSON object by reading data from a csv:
df = pd.read_csv('/testdata.csv', dtype={
    "debt_type": str,
    "debt_amount": int,
    "interest_rate": float,
    "total_monthly_payment": int,
    "remaining_term": int,
    "interest_payable": int})
finalList = []
finalDict = {}
grouped = df.groupby(['debt_type'])
for key, value in grouped:
    dictionary = {}
    j = grouped.get_group(key).reset_index(drop=True)
    dictionary['debt_type'] = j.at[0, 'debt_type']
    dictList = []
    anotherDict = {}
    for i in j.index:
        anotherDict['debt_amount'] = j.at[i, 'debt_amount']
        anotherDict['interest_rate'] = j.at[i, 'interest_rate']
        anotherDict['total_monthly_payment'] = j.at[i, 'total_monthly_payment']
        anotherDict['remaining_term'] = j.at[i, 'remaining_term']
        anotherDict['interest_payable'] = j.at[i, 'interest_payable']
        dictList.append(anotherDict)
    dictionary['loan_info'] = dictList
    finalList.append(dictionary)
finalDict = finalList
and I want to achieve the output below:
{"loan_info":{"debt_amount":9000,"interest_rate":23,"total_monthly_payment":189,"remaining_term":129,"interest_payable":15356},"debt_type":"credit_card"}
However, what I am getting is:
[{'debt_type': 'credit_card', 'loan_info': [{'debt_amount': 9000, 'interest_rate': 12.2, 'total_monthly_payment': 189, 'remaining_term': 129, 'interest_payable': 15256}]}]
Can anyone help here? Thanks in advance.
I think what you need is to use pandas.DataFrame.to_dict() and pandas.DataFrame.to_json().
Right after you read your csv file, you can create a new column loan_info that formats all the fields you want into a Python dictionary:
loan_info_cols = ['debt_amount', 'interest_rate', 'total_monthly_payment', 'remaining_term', 'interest_payable']
df['loan_info'] = df[loan_info_cols].apply(lambda x: x.to_dict(), axis=1)
Then drop the columns we just used:
df = df.drop(loan_info_cols, axis=1)
This is what we have so far:
print(df)
debt_type loan_info
0 credit_card {u'total_monthly_payment': 189.0, u'interest_p...
1 debit_card {u'total_monthly_payment': 165.0, u'interest_p...
Now you can convert the whole dataframe to JSON:
df_json = df.to_json(orient='records', lines=True)
print(df_json)
{"debt_type":"credit_card","loan_info":{"total_monthly_payment":189.0,"interest_payable":15356.0,"interest_rate":23.0,"debt_amount":9000.0,"remaining_term":129.0}}
{"debt_type":"debit_card","loan_info":{"total_monthly_payment":165.0,"interest_payable":21354.0,"interest_rate":24.0,"debt_amount":8000.0,"remaining_term":167.0}}
This is the portion of the code that's causing trouble:
import pandas as pd
import re
df
df.columns = ['Campaigns', 'Impressions', 'Attempts', 'Spend']
Campaigns = df['Campaigns']
IDs = []
for c in Campaigns:
    num = re.search(r'\d{6}', c).group()
    IDs.append(num)
pieces = [df, pd.DataFrame(IDs)]
frame = pd.concat(pieces, axis=1, join='outer', ignore_index=False)
frame['ID'] = frame[0]
del frame[0]
frame
This is the error:
Error: 'NoneType' object has no attribute 'group'
When I try things individually in IPython, everything works. For example:
in>> test = 'YP_WON2_SP8_115436'
in>> num = re.search(r'\d{6}',test)
in>> num.group()
out>> '115436'
I've tried splitting up the code as above and it still throws the same error.
Fixed the code:
df
df.columns = ['Campaigns', 'Impressions', 'Attempts', 'Spend']
Campaigns = df['Campaigns']
ID = []
for c in Campaigns:
    m = re.search(r'\d{6}', c)
    if m:
        ID.append(m.group())  # reuse the match instead of searching twice
    else:
        ID.append('No ID')
pieces = [df, pd.DataFrame(ID)]
frame = pd.concat(pieces, axis=1, join='outer', ignore_index=False)
frame['ID'] = frame[0]
del frame[0]
frame
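As a side note, the loop can also be replaced with pandas' vectorized string methods (a sketch, assuming the same six-digit ID pattern); str.extract yields NaN where the pattern is missing instead of raising:

df['ID'] = df['Campaigns'].str.extract(r'(\d{6})', expand=False).fillna('No ID')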