I am trying to call scikit learn fit functions on dataframes where the elements of each column are numpy arrays. However, I get the error "setting an array element with a sequence," presumably because I am trying to call fit on a dataframe of arrays rather than scalar values. How do I work around this? I'd really appreciate some help.
Here is my code. You can find the data I'm using here: https://competitions.codalab.org/competitions/21163
training_data = pd.read_csv('/train.tsv', sep='\t')
testing_data = pd.read_csv('/dev.tsv', sep='\t')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',do_lower_case=True,max_length=1024)
model = BertModel.from_pretrained('bert-base-uncased')
model = model.to(device)
# These are used to map the data to their appropriate column on each pass
pomt_train_x = pd.DataFrame(columns=["claim", "reason", "category", "speaker", "checker", "tags", "claim entities", "article title"])
feature_dict = {1: "claim", 4: "reason", 5: "category", 6: "speaker", 7: "checker", 8: "tags", 9: "claim entities", 10: "article title"}
# Sort the data appropriately.
for i, data in enumerate(training_data[training_data.columns].to_numpy()):
if 'pomt' in data[0]:
appended_data = {}
for j, sentence in enumerate(data):
if j in feature_dict:
inputs = tokenizer(str(sentence), return_tensors="pt", max_length=512, pad_to_max_length=True).to(device)
outputs = model(**inputs)
appended_data[feature_dict[j]] = outputs.last_hidden_state[:,0][0].cpu().detach().numpy()
pomt_train_x = pomt_train_x.append(appended_data, ignore_index=True)
print(f"{i + 1} out of {training_data.index.stop} from training")
count = 0
# append testing data to training data
for i, data in enumerate(testing_data[testing_data.columns].to_numpy()):
if 'pomt' in data[0]:
appended_data = {}
for j, sentence in enumerate(data):
if j in feature_dict:
inputs = tokenizer(str(sentence), return_tensors="pt", max_length=512, pad_to_max_length=True).to(device)
outputs = model(**inputs)
appended_data[feature_dict[j]] = outputs.last_hidden_state[:,0][0].cpu().detach().numpy()
pomt_train_x = pomt_train_x.append(appended_data, ignore_index=True)
print(f"{i + 1} out of {testing_data.index.stop} from testing")
count += 1
# Map the possible labels to an emotion
positive_set = set(['half-true', 'correct attribution!', 'correct', 'determination: barely true', 'factscan score: true',
'correct attribution', 'mostly true', 'mostly-correct', 'truth!', 'partially true', 'half true',
'mostly truth!', 'determination: true', 'true messages', 'authorship confirmed!', 'verdict: true',
'mostly_true', 'determination: mostly true', 'confirmed authorship!', 'conclusion: accurate', 'accurate',
'true', 'partly true', 'fact', 'full flop', 'in-the-green', 'verified'])
negative_set = set({'fake news', 'verdict: false', '3 pinnochios', 'fiction!', 'bogus warning', 'we rate this claim false',
'determination: false', 'disputed!', 'false', 'fiction', 'a lot of baloney', '2 pinnochios', 'some baloney',
'mostly_false', 'cherry picks', 'miscaptioned', 'misleading!', 'misleading recommendations', 'mostly fiction!',
'mostly false', 'a little baloney', 'fiction! & satire!', 'conclusion: false', 'rating: false',
'determination: misleading', 'promise broken', '4 pinnochios', 'misleading', 'promise kept',
'misattributed', 'fake', 'previously truth! now resolved!','incorrect attribution!', 'incorrect',
'spins the facts', 'determination: a stretch', 'factscan score: misleading', 'pants on fire!',
'factscan score: false', 'exaggerates', 'outdated', 'facebook scams', 'unsupported', 'opinion!',
'verdict: unsubstantiated', 'scam', 'virus!', 'no flip', 'scam!', 'unverified', 'distorts the facts', 'outdated!'
'understated', 'no evidence', 'unproven!', 'inaccurate attribution!', 'statirical reports', 'unproven', 'exaggerated',
'determination: huckster propaganda', 'grass roots movement!', 'commentary!', 'in-the-red', 'unsubstantiated messages',})
neutral_set = set({'truth! & fiction!', 'conclusion: unclear', '1', 'unobservable', 'needs context', 'truth! & disputed!', 'half flip',
'0', 'in-between', '4', 'None', '2', 'none', 'investigation pending!','not the whole story', '10','in the works',
'truth! & misleading!', '3', 'mixture', 'not yet rated', 'legend', 'stalled', 'truth! & unproven!', 'truth! & outdated!',
'compromise'})
# Read in the labels for the appropriate data
pomt_train_y = pd.DataFrame(columns=["label"])
sign_to_append = 0
for i, data in enumerate(training_data[training_data.columns].to_numpy()):
if 'pomt' in data[0]:
if data[2] in positive_set:
sign_to_append = 1
elif data[2] in negative_set:
sign_to_append = -1
else:
sign_to_append = 0
pomt_train_y = pomt_train_y.append({'label':sign_to_append}, ignore_index=True)
print(f"{i + 1} out of {training_data.index.stop} from training")
# append testing data to training data
for i, data in enumerate(testing_data[testing_data.columns].to_numpy()):
if 'pomt' in data[0]:
if data[2] in positive_set:
sign_to_append = 1
elif data[2] in negative_set:
sign_to_append = -1
else:
sign_to_append = 0
pomt_train_y = pomt_train_y.append({'label':sign_to_append}, ignore_index=True)
print(f"{i + 1} out of {testing_data.index.stop} from testing")
pomt_X_train, pomt_X_test, pomt_Y_train, pomt_Y_test = train_test_split(pomt_train_x, pomt_train_y, test_size= (count / pomt_train_x.shape[0]), stratify=pomt_train_y)
pomt_Y_train = pomt_Y_train.astype("int")
pomt_Y_test = pomt_Y_test.astype("int")
# One Vs. One Multiclass Classification
clf = OneVsOneClassifier(SVC(C = 1, verbose=True))
# Fit to Training Data
clf.fit(pomt_X_train, pomt_Y_train)
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
TypeError: only size-1 arrays can be converted to Python scalars
The above exception was the direct cause of the following exception:
ValueError Traceback (most recent call last)
<ipython-input-22-3314e23093e3> in <module>()
1 # Fit to Training Data
----> 2 clf.fit(pomt_X_train.squeeze(), pomt_Y_train)
3
4 # Training data accuracy
5 X_train_prediction = clf.predict(pomt_X_train)
4 frames
/usr/local/lib/python3.7/dist-packages/pandas/core/generic.py in __array__(self, dtype)
1991
1992 def __array__(self, dtype: NpDtype | None = None) -> np.ndarray:
-> 1993 return np.asarray(self._values, dtype=dtype)
1994
1995 def __array_wrap__(
ValueError: setting an array element with a sequence.
I figured out what to do on my own end. I basically just created a column in the dataframe to reflect each element of the list, not each list itself. It's a bit unintuitive but it works.
Related
So I have a data set with user, date, and post columns. I'm trying to generate a column of the calories that foods contain in the post column for each user. This dataset has a length of 21, and the code below finds the food words, get their calorie value, append it to that user's respective calorie list, and append that list to the new column. The new generated column, however, somehow has a length of 25:
Current data: 21
New column: 25
Does anybody know why this occurs? Here is the code below and samples of what the original dataset and the new column look like:
while len(col) < len(data['post']):
for post, api_id, api_key in zip(data['post'], ids_keys.keys(), ids_keys.values()): # cycles through text data & api keys
headers = {
'Content-Type': 'application/x-www-form-urlencoded',
'x-app-id': api_id,
'x-app-key': api_key,
'x-remote-user-id': '0'
}
calories = []
print('Current data:', len(data['post']), '\n New column: ', len(col)) # prints length of post vs new cal column
for word in eval(post):
if word not in food:
continue
else:
print('Detected Word: ', word)
query = {'query': '{}'.format(word)}
try:
response = requests.request("POST", url, headers=headers, data=query)
except KeyError as ke:
print(ke, 'Out of calls, next key...')
ids_keys.pop(api_id) # drop current api id & key from dict if out of calls
print('API keys left:', len(ids_keys))
finally:
stats = response.json()
print('Food Stats: \n', stats)
print('Calories in food: ', stats['foods'][0]['nf_calories'])
calories.append(stats['foods'][0]['nf_calories'])
print('Current Key', api_id, ':', api_key)
col.append(calories)
if len(col) == len(data['post']):
break
I attempted to use the while loop to only append up to the length of the dataset, but to no avail.
Original Data Set:
pd.DataFrame({'user':['avskk', 'janejellyn', 'firlena227','...'],
'date': ['October 22', 'October 22', 'October 22','...'],
'post': [['autumn', 'fully', 'arrived', 'cooking', 'breakfast', 'toaster','...'],
['breakfast', 'chinese', 'sticky', 'rice', 'tempeh', 'sausage', 'cucumber', 'salad', 'lunch', 'going', 'lunch', 'coworkers', 'probably', 'black', 'bean', 'burger'],
['potato', 'bean', 'inspiring', 'food', 'day', 'today', '...']]
})
New Column:
pd.DataFrame({'Calories': [[22,33,45,32,2,5,7,9,76],
[43,78,54,97,32,56,97],
[23,55,32,22,7,99,66,98,54,35,33]]
})
I want to connect separated messages of a chat, so I created a list for all the dictionaries
messages = ["Hello", "How you doing","fine","how can I help you", "how to do this?", "like this", "thanks","man","no problem"]
Person1= [True,True,False,False,True,False,True,True,False]
data =[]
chat_messages = messages
people = Person1
k = 0
for i in range(len(messages)):
if people[i] == people[i+1]:
chat_messages[i+1] = chat_messages[i] +" " +chat_messages[i+1]
chatData = {'text': chat_messages[i+1], 'person1': people[i]}
data[k] = chatData
else:
k +=1
chatData = {'text': chat_messages[i+1], 'person1': people[i+1]}
print(chatData )
data[k] = chatData
print(data)
I'm getting errors in here
File "main.py", line 20, in <module>
data[k] = chatData
IndexError: list assignment index out of range
How can I fix it please?
I want the output of data to look like this:
data = [{'text': 'Hello How you doing', 'person1': True} , {'text': 'fine how can I help you', 'person1': False}, {'text': 'how to do this?', 'person1': True}]
You cant add elements to a list in python this way. you have to use python method append().
data.append(chatData)
This method will add elements at the end of the list.
You can learn more python list methods using this link
https://www.geeksforgeeks.org/list-methods-python/
The problem is that when you add the index i + 1, it gives an error when you get to the nr 9, because your list index stops at 8. Here is my solution:
messages = ["Hello", "How you doing","fine","how can I help you", "how to do this?", "like this", "thanks","man","no problem"]
Person1= [True,True,False,False,True,False,True,True,False]
data =[]
chat_messages = messages
people = Person1
k = 0
data = []
for i, msg in enumerate(messages):
try:
if people[i] == people[i+1]:
chat_messages[i+1] = chat_messages[i] +" " +chat_messages[i+1]
data.append({'text': chat_messages[i+1], 'person1': people[i]})
except:
pass
print(data)
messages = ["Hello", "How you doing","fine","how can I help you", "how to do this?", "like this", "thanks","man","no problem"]
Person1= [True,True,False,False,True,False,True,True,False]
data =[]
chat_messages = messages
people = Person1
k = 0
for i in range(len(messages)):
if len(messages)-1 is i:
None
else:
if people[i] == people[i+1]:
chat_messages[i+1] = chat_messages[i] +" " +chat_messages[i+1]
chatData = {'text': chat_messages[i+1], 'person1': people[i]}
data.append(chatData)
else:
chatData = {'text': chat_messages[i+1], 'person1': people[i+1]}
print(chatData )
data.append(chatData)
print(data)
SpaCy has implemented a sense2vec word embeddings package which they document here
The vectors are all of the form WORD|POS. For example, the sentence
Dear local newspaper, I think effects computers have on people are great learning skills/affects because they give us time to chat with friends/new people, helps us learn about the globe(astronomy) and keeps us out of trouble
Needs to be converted into
Dear|ADJ local|ADJ newspaper|NOUN ,|PUNCT I|PRON think|VERB effects|NOUN computers|NOUN have|VERB on|ADP people|NOUN are|VERB great|ADJ learning|NOUN skills/affects|NOUN because|ADP they|PRON give|VERB us|PRON time|NOUN to|PART chat|VERB with|ADP friends/new|ADJ people|NOUN ,|PUNCT helps|VERB us|PRON learn|VERB about|ADP the|DET globe(astronomy|NOUN )|PUNCT and|CONJ keeps|VERB us|PRON out|ADP of|ADP trouble|NOUN !|PUNCT
In order to be interpretable by the sense2vec pretrained embeddings and in order to be in the sense2vec format.
How can this be done?
Based off of SpaCy's bin/merge.py implementation which does exactly what is needed:
from spacy.en import English
import re
LABELS = {
'ENT': 'ENT',
'PERSON': 'ENT',
'NORP': 'ENT',
'FAC': 'ENT',
'ORG': 'ENT',
'GPE': 'ENT',
'LOC': 'ENT',
'LAW': 'ENT',
'PRODUCT': 'ENT',
'EVENT': 'ENT',
'WORK_OF_ART': 'ENT',
'LANGUAGE': 'ENT',
'DATE': 'DATE',
'TIME': 'TIME',
'PERCENT': 'PERCENT',
'MONEY': 'MONEY',
'QUANTITY': 'QUANTITY',
'ORDINAL': 'ORDINAL',
'CARDINAL': 'CARDINAL'
}
nlp = False;
def tag_words_in_sense2vec_format(passage):
global nlp;
if(nlp == False): nlp = English()
if isinstance(passage, str): passage = passage.decode('utf-8',errors='ignore');
doc = nlp(passage);
return transform_doc(doc);
def transform_doc(doc):
for index, ent in enumerate(doc.ents):
ent.merge(ent.root.tag_, ent.text, LABELS[ent.label_])
#if index % 100 == 0: print ("enumerating at entity index " + str(index));
#for np in doc.noun_chunks:
# while len(np) > 1 and np[0].dep_ not in ('advmod', 'amod', 'compound'):
# np = np[1:]
# np.merge(np.root.tag_, np.text, np.root.ent_type_)
strings = []
for index, sent in enumerate(doc.sents):
if sent.text.strip():
strings.append(' '.join(represent_word(w) for w in sent if not w.is_space))
#if index % 100 == 0: print ("converting at sentence index " + str(index));
if strings:
return '\n'.join(strings) + '\n'
else:
return ''
def represent_word(word):
if word.like_url:
return '%%URL|X'
text = re.sub(r'\s', '_', word.text)
tag = LABELS.get(word.ent_type_, word.pos_)
if not tag:
tag = '?'
return text + '|' + tag
Where
print(tag_words_in_sense2vec_format("Dear local newspaper, ..."))
results in
Dear|ADJ local|ADJ newspaper|NOUN ,|PUNCT ...
Above is the input table i have in csv
I am trying to use array and while loops in python. I am new to this language. Loops should occur twice to give Category\sub-category\sub-category_1 order...I am trying to use split().Ouput should be like below
import csv
with open('D:\\test.csv', 'rb') as f:
reader = csv.reader(f, delimiter='',quotechar='|')
data = []
for name in reader:
data[name] = []
And if you read the lines of your csv and access the data then you can manipulate the way you want later.
cats = {}
with open('my.csv', "r") as ins:
# check each line of the fine
for line in ins:
# remove double quotes: replace('"', '')
# remove break line : rstrip()
a = str(line).replace('"', '').rstrip().split('|')
if a[0] != 'CatNo':
cats[int(a[0])] = a[1:];
for p in cats:
print 'cat_id: %d, value: %s' % (p, cats[p])
# you can access the value by the int ID
print cats[1001]
the output:
cat_id: 100, value: ['Best Sellers', 'Best Sellers']
cat_id: 1001, value: ['New this Month', 'New Products\\New this Month']
cat_id: 10, value: ['New Products', 'New Products']
cat_id: 1003, value: ['Previous Months', 'New Products\\Previous Months']
cat_id: 110, value: ['Promotional Material', 'Promotional Material']
cat_id: 120, value: ['Discounted Products & Special Offers', 'Discounted Products & Special Offers']
cat_id: 1002, value: ['Last Month', 'New Products\\Last Month']
['New this Month', 'New Products\\New this Month']
Updated script for your question:
categories = {}
def get_parent_category(cat_id):
if len(cat_id) <= 2:
return '';
else:
return cat_id[:-1]
with open('my.csv', "r") as ins:
for line in ins:
# remove double quotes: replace('"', '')
# remove break line : rstrip()
a = str(line).replace('"', '').rstrip().split('|')
cat_id = a[0]
if cat_id != 'CatNo':
categories[cat_id] = {
'parent': get_parent_category(cat_id),
'desc': a[1],
'long_desc': a[2]
};
print 'Categories relations:'
for p in categories:
parent = categories[p]['parent']
output = categories[p]['desc']
while parent != '':
output = categories[parent]['desc'] + ' \\ ' + output
parent = categories[parent]['parent']
print '\t', output
output:
Categories relations:
New Products
New Products \ Best Sellers
New Products \ Discounted Products & Special Offers
New Products \ Best Sellers \ Previous Months
New Products \ Best Sellers \ Last Month
New Products \ Best Sellers \ New this Month
I am experiencing a strange faulty behaviour, where a dictionary is only appended once and I can not add more key value pairs to it.
My code reads in a multi-line string and extracts substrings via split(), to be added to a dictionary. I make use of conditional statements. Strangely only the key:value pairs under the first conditional statement are added.
Therefore I can not complete the dictionary.
How can I solve this issue?
Minimal code:
#I hope the '\n' is sufficient or use '\r\n'
example = "Name: Bugs Bunny\nDOB: 01/04/1900\nAddress: 111 Jokes Drive, Hollywood Hills, CA 11111, United States"
def format(data):
dic = {}
for line in data.splitlines():
#print('Line:', line)
if ':' in line:
info = line.split(': ', 1)[1].rstrip() #does not work with files
#print('Info: ', info)
if ' Name:' in info: #middle name problems! /maiden name
dic['F_NAME'] = info.split(' ', 1)[0].rstrip()
dic['L_NAME'] = info.split(' ', 1)[1].rstrip()
elif 'DOB' in info: #overhang
dic['DD'] = info.split('/', 2)[0].rstrip()
dic['MM'] = info.split('/', 2)[1].rstrip()
dic['YY'] = info.split('/', 2)[2].rstrip()
elif 'Address' in info:
dic['STREET'] = info.split(', ', 2)[0].rstrip()
dic['CITY'] = info.split(', ', 2)[1].rstrip()
dic['ZIP'] = info.split(', ', 2)[2].rstrip()
return dic
if __name__ == '__main__':
x = format(example)
for v, k in x.iteritems():
print v, k
Your code doesn't work, at all. You split off the name before the colon and discard it, looking only at the value after the colon, stored in info. That value never contains the names you are looking for; Name, DOB and Address all are part of the line before the :.
Python lets you assign to multiple names at once; make use of this when splitting:
def format(data):
dic = {}
for line in data.splitlines():
if ':' not in line:
continue
name, _, value = line.partition(':')
name = name.strip()
if name == 'Name':
dic['F_NAME'], dic['L_NAME'] = value.split(None, 1) # strips whitespace for us
elif name == 'DOB':
dic['DD'], dic['MM'], dic['YY'] = (v.strip() for v in value.split('/', 2))
elif name == 'Address':
dic['STREET'], dic['CITY'], dic['ZIP'] = (v.strip() for v in value.split(', ', 2))
return dic
I used str.partition() here rather than limit str.split() to just one split; it is slightly faster that way.
For your sample input this produces:
>>> format(example)
{'CITY': 'Hollywood Hills', 'ZIP': 'CA 11111, United States', 'L_NAME': 'Bunny', 'F_NAME': 'Bugs', 'YY': '1900', 'MM': '04', 'STREET': '111 Jokes Drive', 'DD': '01'}
>>> from pprint import pprint
>>> pprint(format(example))
{'CITY': 'Hollywood Hills',
'DD': '01',
'F_NAME': 'Bugs',
'L_NAME': 'Bunny',
'MM': '04',
'STREET': '111 Jokes Drive',
'YY': '1900',
'ZIP': 'CA 11111, United States'}