Populating python matrix - python

I'm doing the splitting of the words from the text file in python. I've receive the number of row (c) and a dictionary (word_positions) with index. Then I create a zero matrix (c, index). Here is the code:
from collections import defaultdict
import re
import numpy as np
c=0
f = open('/Users/Half_Pint_Boy/Desktop/sentenses.txt', 'r')
for line in f:
c = c + 1
word_positions = {}
with open('/Users/Half_Pint_Boy/Desktop/sentenses.txt', 'r') as f:
index = 0
for word in re.findall(r'[a-z]+', f.read().lower()):
if word not in word_positions:
word_positions[word] = index
index += 1
print(word_positions)
matrix=np.zeros(c,index)
My question: How can I populate the matrix to be able to get this: matrix[c,index] = count, where c - is the number of row, index -the indexed position and count -the number of counted words in a row

Try next:
import re
import numpy as np
from itertools import chain
text = open('/Users/Half_Pint_Boy/Desktop/sentenses.txt')
text_list = text.readlines()
c=0
for i in range(len(text_list)):
c=c+1
text_niz = []
for i in range(len(text_list)):
text_niz.append(text_list[i].lower()) # перевел к нижнему регистру
slovo = []
for j in range(len(text_niz)):
slovo.append(re.split('[^a-z]', text_niz[j])) # токенизация
for e in range(len(slovo)):
while slovo[e].count('') != 0:
slovo[e].remove('') # удалил пустые слова
slovo_list = list(chain(*slovo))
print (slovo_list) # составил список слов
slovo_list=list(set(slovo_list)) # удалил повторяющиеся
x=len(slovo_list)
s = []
for i in range(len(slovo)):
for j in range(len(slovo_list)):
s.append(slovo[i].count(slovo_list[j])) # посчитал количество слов в каждом предложении
matr = np.array(s) # матрица вхождений слов в предложения
d = matr.reshape((c, x)) # преобразовал в матрицу 22*254

It looks like you are trying to create something similar to an n-dimensional list. these are achieved by nesting lists inside themselves as such:
two_d_list = [[0, 1], [1, 2], [example, blah, blah blah]]
words = two_d_list[2]
single_word = two_d_list[2][1] # Notice the second index operator
This concept is very flexible in Python and can also be done with a dictionary nested inside as you would like:
two_d_list = [{"word":1}, {"example":1, "blah":3}]
words = two_d_list[1] # type(words) == dict
single_word = two_d_list[2]["example"] # Similar index operator, but for the dictionary
This achieves what you would like, functionally, but does not use the syntax matrix[c,index], however this syntax does not really exist in python for indexing. Commas within square-brackets usually delineate the elements of list literals. Instead you can access the row's dictionary's element with matrix[c][index] = count
You may be able to overload the index operator to achieve the syntx you want. Here is a question about achieving the syntax you desire. In summary:
Overload the __getitem__(self, inex) function in a wrapper of the list class and set the function to accept a tuple. The tuple can be created without parenthesis, giving the syntax matrix[c, index] = count

Related

split string row wise with condition in python

I have some strings in a column and I want to explode the words out only if they are not within brackets. The column looks like this
pd.DataFrame(data={'a': ['first,string','(second,string)','third,string (another,string,here)']})
and I want the output to look like this
pd.DataFrame(data={'a': ['first','string','(second,string)','third','string','(another,string,here)']})
This sort of works, but i would like to not have to put the row number in each time
re.split(r',(?![^()]*\))', x['a'][0])
re.split(r',(?![^()]*\))', x['a'][1])
re.split(r',(?![^()]*\))', x['a'][2])
i thought i could do with a lmbda function but i cannot get it to work. Thanks for checking this out
x['a'].apply(lambda i: re.split(r',(?![^()]*\))', i))
It is not clear to me if the elements in your DataFrame may have multiple groups between brackets. Given that doubt, I have implemented the following:
import pandas as pd
import re
df = pd.DataFrame(data={'a': ['first,string','(second,string)','third,string (another,string,here)']})
pattern = re.compile("([^\(]*)([\(]?.*[\)]?)(.*)", re.IGNORECASE)
def findall(ar, res = None):
if res is None:
res = []
m = pattern.findall(ar)[0]
if len(m[0]) > 0:
res.extend(m[0].split(","))
if len(m[1]) > 0:
res.append(m[1])
if len(m[2]) > 0:
return findall(ar[2], res = res)
else:
return res
res = []
for x in df["a"]:
res.extend(findall(x))
print(pd.DataFrame(data={"a":res}))
Essentially, you recursively scan the last part of the match until you find no more words between strings. If order was not an issue, the solution is easier.

Splitting an array into two arrays in Python

I have a list of numbers like so;
7072624 through 7072631
7072672 through 7072687
7072752 through 7072759
7072768 through 7072783
The below code is what I have so far, i've removed the word "through" and it now prints a list of numbers.
import os
def file_read(fname):
content_array = []
with open (fname) as f:
for line in f:
content_array.append(line)
#print(content_array[33])
#new_content_array = [word for line in content_array[33:175] for word in line.split()]
new_content_array = [word for line in content_array[33:37] for word in line.split()]
while 'through' in new_content_array: new_content_array.remove('through')
print(new_content_array)
file_read('numbersfile.txt')
This gives me the following output.
['7072624', '7072631', '7072672', '7072687', '7072752', '7072759', '7072768', '7072783']
So what I'm wanting to do but struggling to find is how to split the 'new_content_array' into two arrays so the output is as follows.
array1 = [7072624, 7072672, 7072752, 7072768]
array2 = [7072631, 7072687, 7072759, 7072783]
I then want to be able to take each value in array 2 from the value in array 1
7072631 - 7072624
7072687 - 7072672
7072759 - 7072752
7072783 - 7072768
I've been having a search but can't find anything similar to my situation.
Thanks in advance!
Try this below:
list_data = ['7072624', '7072631', '7072672', '7072687', '7072752', '7072759', '7072768', '7072783']
array1 = [int(list_data[i]) for i in range(len(list_data)) if i % 2 == 0]
array2 = [int(list_data[i]) for i in range(len(list_data)) if i % 2 != 0]
l = ['7072624', '7072631', '7072672', '7072687', '7072752', '7072759','7072768', '7072783']
l1 = [l[i] for i in range(len(l)) if i % 2 == 0]
l2 = [l[i] for i in range(len(l)) if i % 2 == 1]
print(l1) # ['7072624', '7072672', '7072752', '7072768']
print(l2) # ['7072631', '7072687', '7072759', '7072783']
result = list(zip(l1,l2))
As a result you will get:
[('7072624', '7072631'),
('7072672', '7072687'),
('7072752', '7072759'),
('7072768', '7072783')]
I think that as comprehension list, but you could also use filter
You could try to split line using through keyword,
then removing all non numeric chars such as new line or space using a lambda function and regex inside a list comprehension
import os
import re
def file_read(fname):
new_content_array = []
with open (fname) as f:
for line in f:
line_array = line.split('through')
new_content_array.append([(lambda x: re.sub(r'[^0-9]', "", x))(element) for element in line_array])
print(new_content_array)
file_read('numbersfile.txt')
Output looks like this:
[['7072624', '7072631'], ['7072672', '7072687'], ['7072752', '7072759'], ['7072768', '7072783']]
Then you just could extract first element of each nested list to store separately in a variable and so on with second element.
Good luck

How to compress a python list of strings in format string[number]/[number]

I have a list of strings ending with numbers. Want to sort them in python and then compress them if a range is formed.
Eg input string :
ABC1/3, ABC1/1, ABC1/2, ABC2/3, ABC2/2, ABC2/1
Eg output string:
ABC1/1-3, ABC2/1-3
How should I approach this problem with python?
There's no need to use a dict for this problem. You can simply parse the tokens into a list and sort it. By default Python sorts a list of lists by the individual elements of each list. After sorting the list of token pairs, you only need to iterate once and record the important indices. Try this:
# Data is a comma separated list of name/number pairs.
data = 'ABC1/3, ABC1/1, ABC1/2, ABC2/3, ABC2/2, ABC2/1'
# Split data on ', ' and split each token on '/'.
tokens = [token.split('/') for token in data.split(', ')]
# Convert token number to integer.
for index in range(len(tokens)):
tokens[index][1] = int(tokens[index][1])
# Sort pairs, automatically orders lists by items.
tokens.sort()
prev = 0 # Record index of previous pair's name.
indices = [] # List to record indices for output.
for index in range(1, len(tokens)):
# If name matches with previous position.
if tokens[index][0] == tokens[prev][0]:
# Check whether number is increasing sequentially.
if tokens[index][1] != (tokens[index - 1][1] + 1):
# If non-sequential increase then record the indices.
indices.append((prev, index - 1))
prev = index
else:
# If name changes then record the indices.
indices.append((prev, index - 1))
prev = index
# After iterating the list, record the indices.
indices.append((prev, index))
# Print the ranges.
for start, end in indices:
if start == end:
args = (tokens[start][0], tokens[start][1])
print '{0}/{1},'.format(*args),
else:
args = (tokens[start][0], tokens[start][1], tokens[end][1])
print '{0}/{1}-{2},'.format(*args),
# Output:
# ABC1/1-3 ABC2/1-3
I wanted to speedhack this problem, so here is an almost complete solution for you, based on my own make_range_string and a stolen natsort.
import re
from collections import defaultdict
def sortkey_natural(s):
return tuple(int(part) if re.match(r'[0-9]+$', part) else part
for part in re.split(r'([0-9]+)', s))
def natsort(collection):
return sorted(collection, key=sortkey_natural)
def make_range_string(collection):
collection = sorted(collection)
parts = []
range_start = None
previous = None
def push_range(range_start, previous):
if range_start is not None:
if previous == range_start:
parts.append(str(previous))
else:
parts.append("{}-{}".format(range_start, previous))
for i in collection:
if previous != i - 1:
push_range(range_start, previous)
range_start = i
previous = i
push_range(range_start, previous)
return ', '.join(parts)
def make_ranges(strings):
components = defaultdict(list)
for i in strings:
main, _, number = i.partition('/')
components[main].append(int(number))
rvlist = []
for key in natsort(components):
rvlist.append((key, make_range_string(components[key])))
return rvlist
print(make_ranges(['ABC1/3', 'ABC1/1', 'ABC1/2', 'ABC2/5', 'ABC2/2', 'ABC2/1']))
The code prints a list of tuples:
[('ABC1', '1-3'), ('ABC2', '1-2, 5')]
I would start by splitting the strings, and using the part that you want to match on as a dictionary key.
strings = ['ABC1/3', 'ABC1/1', 'ABC1/2', 'ABC2/3', 'ABC2/2', 'ABC2/1']
d = {}
for s in string:
a, b = s.split('/')
d.get(a, default=[]).append(b)
That collects the number parts into a list for each prefix. Then you can sort the lists and look for adjacent numbers to join.

Python: How to replace N random string occurrences in text?

Say that I have 10 different tokens, "(TOKEN)" in a string. How do I replace 2 of those tokens, chosen at random, with some other string, leaving the other tokens intact?
>>> import random
>>> text = '(TOKEN)__(TOKEN)__(TOKEN)__(TOKEN)__(TOKEN)__(TOKEN)__(TOKEN)__(TOKEN)__(TOKEN)__(TOKEN)'
>>> token = '(TOKEN)'
>>> replace = 'foo'
>>> num_replacements = 2
>>> num_tokens = text.count(token) #10 in this case
>>> points = [0] + sorted(random.sample(range(1,num_tokens+1),num_replacements)) + [num_tokens+1]
>>> replace.join(token.join(text.split(token)[i:j]) for i,j in zip(points,points[1:]))
'(TOKEN)__(TOKEN)__(TOKEN)__(TOKEN)__foo__(TOKEN)__foo__(TOKEN)__(TOKEN)__(TOKEN)'
In function form:
>>> def random_replace(text, token, replace, num_replacements):
num_tokens = text.count(token)
points = [0] + sorted(random.sample(range(1,num_tokens+1),num_replacements)) + [num_tokens+1]
return replace.join(token.join(text.split(token)[i:j]) for i,j in zip(points,points[1:]))
>>> random_replace('....(TOKEN)....(TOKEN)....(TOKEN)....(TOKEN)....(TOKEN)....(TOKEN)....(TOKEN)....(TOKEN)....','(TOKEN)','FOO',2)
'....FOO....(TOKEN)....(TOKEN)....(TOKEN)....(TOKEN)....(TOKEN)....(TOKEN)....FOO....'
Test:
>>> for i in range(0,9):
print random_replace('....(0)....(0)....(0)....(0)....(0)....(0)....(0)....(0)....','(0)','(%d)'%i,i)
....(0)....(0)....(0)....(0)....(0)....(0)....(0)....(0)....
....(0)....(0)....(0)....(0)....(1)....(0)....(0)....(0)....
....(0)....(0)....(0)....(0)....(0)....(2)....(2)....(0)....
....(3)....(0)....(0)....(3)....(0)....(3)....(0)....(0)....
....(4)....(4)....(0)....(0)....(4)....(4)....(0)....(0)....
....(0)....(5)....(5)....(5)....(5)....(0)....(0)....(5)....
....(6)....(6)....(6)....(0)....(6)....(0)....(6)....(6)....
....(7)....(7)....(7)....(7)....(7)....(7)....(0)....(7)....
....(8)....(8)....(8)....(8)....(8)....(8)....(8)....(8)....
If you need exactly two, then:
Detect the tokens (keep some links to them, like index into the string)
Choose two at random (random.choice)
Replace them
What are you trying to do, exactly? A good answer will depend on that...
That said, a brute-force solution that comes to mind is to:
Store the 10 tokens in an array, such that tokens[0] is the first token, tokens[1] is the second, ... and so on
Create a dictionary to associate each unique "(TOKEN)" with two numbers: start_idx, end_idx
Write a little parser that walks through your string and looks for each of the 10 tokens. Whenever one is found, record the start/end indexes (as start_idx, end_idx) in the string where that token occurs.
Once done parsing, generate a random number in the range [0,9]. Lets call this R
Now, your random "(TOKEN)" is tokens[R];
Use the dictionary in step (3) to find the start_idx, end_idx values in the string; replace the text there with "some other string"
My solution in code:
import random
s = "(TOKEN)test(TOKEN)fgsfds(TOKEN)qwerty(TOKEN)42(TOKEN)(TOKEN)ttt"
replace_from = "(TOKEN)"
replace_to = "[REPLACED]"
amount_to_replace = 2
def random_replace(s, replace_from, replace_to, amount_to_replace):
parts = s.split(replace_from)
indices = random.sample(xrange(len(parts) - 1), amount_to_replace)
replaced_s_parts = list()
for i in xrange(len(parts)):
replaced_s_parts.append(parts[i])
if i < len(parts) - 1:
if i in indices:
replaced_s_parts.append(replace_to)
else:
replaced_s_parts.append(replace_from)
return "".join(replaced_s_parts)
#TEST
for i in xrange(5):
print random_replace(s, replace_from, replace_to, 2)
Explanation:
Splits string into several parts using replace_from
Chooses indexes of tokens to replace using random.sample. This returned list contains unique numbers
Build a list for string reconstruction, replacing tokens with generated index by replace_to.
Concatenate all list elements into single string
Try this solution:
import random
def replace_random(tokens, eqv, n):
random_tokens = eqv.keys()
random.shuffle(random_tokens)
for i in xrange(n):
t = random_tokens[i]
tokens = tokens.replace(t, eqv[t])
return tokens
Assuming that a string with tokens exists, and a suitable equivalence table can be constructed with a replacement for each token:
tokens = '(TOKEN1) (TOKEN2) (TOKEN3) (TOKEN4) (TOKEN5) (TOKEN6) (TOKEN7) (TOKEN8) (TOKEN9) (TOKEN10)'
equivalences = {
'(TOKEN1)' : 'REPLACEMENT1',
'(TOKEN2)' : 'REPLACEMENT2',
'(TOKEN3)' : 'REPLACEMENT3',
'(TOKEN4)' : 'REPLACEMENT4',
'(TOKEN5)' : 'REPLACEMENT5',
'(TOKEN6)' : 'REPLACEMENT6',
'(TOKEN7)' : 'REPLACEMENT7',
'(TOKEN8)' : 'REPLACEMENT8',
'(TOKEN9)' : 'REPLACEMENT9',
'(TOKEN10)' : 'REPLACEMENT10'
}
You can call it like this:
replace_random(tokens, equivalences, 2)
> '(TOKEN1) REPLACEMENT2 (TOKEN3) (TOKEN4) (TOKEN5) (TOKEN6) (TOKEN7) (TOKEN8) REPLACEMENT9 (TOKEN10)'
There are lots of ways to do this. My approach would be to write a function that takes the original string, the token string, and a function that returns the replacement text for an occurrence of the token in the original:
def strByReplacingTokensUsingFunction(original, token, function):
outputComponents = []
matchNumber = 0
unexaminedOffset = 0
while True:
matchOffset = original.find(token, unexaminedOffset)
if matchOffset < 0:
matchOffset = len(original)
outputComponents.append(original[unexaminedOffset:matchOffset])
if matchOffset == len(original):
break
unexaminedOffset = matchOffset + len(token)
replacement = function(original=original, offset=matchOffset, matchNumber=matchNumber, token=token)
outputComponents.append(replacement)
matchNumber += 1
return ''.join(outputComponents)
(You could certainly change this to use shorter identifiers. My style is somewhat more verbose than typical Python style.)
Given that function, it's easy to replace two random occurrences out of ten. Here's some sample input:
sampleInput = 'a(TOKEN)b(TOKEN)c(TOKEN)d(TOKEN)e(TOKEN)f(TOKEN)g(TOKEN)h(TOKEN)i(TOKEN)j(TOKEN)k'
The random module has a handy method for picking random items from a population (not picking the same item twice):
import random
replacementIndexes = random.sample(range(10), 2)
Then we can use the function above to replace the randomly-chosen occurrences:
sampleOutput = strByReplacingTokensUsingFunction(sampleInput, '(TOKEN)',
(lambda matchNumber, token, **keywords:
'REPLACEMENT' if (matchNumber in replacementIndexes) else token))
print sampleOutput
And here's some test output:
a(TOKEN)b(TOKEN)cREPLACEMENTd(TOKEN)e(TOKEN)fREPLACEMENTg(TOKEN)h(TOKEN)i(TOKEN)j(TOKEN)k
Here's another run:
a(TOKEN)bREPLACEMENTc(TOKEN)d(TOKEN)e(TOKEN)f(TOKEN)gREPLACEMENTh(TOKEN)i(TOKEN)j(TOKEN)k
from random import sample
mystr = 'adad(TOKEN)hgfh(TOKEN)hjgjh(TOKEN)kjhk(TOKEN)jkhjk(TOKEN)utuy(TOKEN)tyuu(TOKEN)tyuy(TOKEN)tyuy(TOKEN)tyuy(TOKEN)'
def replace(mystr, substr, n_repl, replacement='XXXXXXX', tokens=10, index=0):
choices = sorted(sample(xrange(tokens),n_repl))
for i in xrange(choices[-1]+1):
index = mystr.index(substr, index) + 1
if i in choices:
mystr = mystr[:index-1] + mystr[index-1:].replace(substr,replacement,1)
return mystr
print replace(mystr,'(TOKEN)',2)

Finding items that occur exactly once in an array

I have an 2 dimensional array. Each of the row vectors, in this case, is considered a quantity of interest. What I want to do is return all the rows that appear exactly once as one array, and all the rows that appear more than once as a second array.
For example, if the array was:
a=[[1,1,1,0], [1,1,1,0], [5,1,6,0], [3,2,1,0], [4,4,1,0], [5,1,6,0]]
I would like to return two arrays:
nonsingles=[[1,1,1,0], [1,1,1,0], [5,1,6,0], [5,1,6,0]]
singles= [[3,2,1,0], [4,4,1,0]]
It is important that the order stay preserved. The code I have written to do this is as follows:
def singles_nonsingles(array):
#returns the elements that occur only once, and the elements
#that occur more than once in the array
singles=[]
nonsingles=[]
arrayhash=map(tuple, array)
for x in arrayhash:
if (arrayhash.count(x)==1):
singles.append(x)
if (arrayhash.count(x)>1):
nonsingles.append(x)
nonsingles=array(nonsingles)
singles=array(singles)
return {'singles':singles, 'nonsingles':nonsingles}
Now, I am happy to say that this works, but unhappy to say that it is extremely slow, as a typical array i have is 30000(rows)x10 elements/row=300000 elements. Can anyone give me some tips about how to speed this up?? I apologize if this question is very simple, I am new to Python. Also, I am using Numpy/Scipy with Python 2.7, if that is any help.
In Python 2.7 or above, you can use collections.Counter to count the number of occurrences:
def unique_items(iterable):
tuples = map(tuple, iterable)
counts = collections.Counter(tuples)
unique = []
non_unique = []
for t in tuples:
if counts[t] == 1:
unique.append(t)
else:
non_unique.append(t)
return unique, non_unique
I think your problem is that you are doing an in test on a list. This has O(n) performance.
It should be faster to build a dict and then use that to figure out what to do with each row.
EDIT: The code had an unnecessary enumerate() in it; I stripped it out.
from collections import defaultdict
def singles_nonsingles(array):
#returns the elements that occur only once, and the elements
#that occur more than once in the array
singles=[]
nonsingles=[]
d = defaultdict(int)
t = [tuple(row) for row in array]
for row in t:
d[row] += 1
for row in t:
if d[row] == 1:
singles.append(row)
else:
nonsingles.append(row)
return {'singles':singles, 'nonsingles':nonsingles}
Here's a version that only returns unique rows:
from collections import defaultdict
def singles_nonsingles(array):
#returns the elements that occur only once, and the elements
#that occur more than once in the array
singles=[]
nonsingles=[]
d = defaultdict(int)
already_seen = set()
t = [tuple(row) for row in array]
for row in t:
d[row] += 1
for row in t:
if row in already_seen:
continue
if d[row] == 1:
singles.append(row)
else:
nonsingles.append(row)
already_seen.add(row)
return {'singles':singles, 'nonsingles':nonsingles}
a=[[1,1,1,0], [1,1,1,0], [5,1,6,0], [3,2,1,0], [4,4,1,0], [5,1,6,0]]
x = singles_nonsingles(a)
print("Array: " + str(a))
print(x)
The first return only the list of the single/no single arrays without repetitions, the second with repetitions
def comp (multi):
from collections import defaultdict
res = defaultdict(int)
for vect in multi:
res[tuple(vect)] += 1
singles = []
no_singles = []
for k in res:
if res[k] > 1:
no_singles.append(list(k))
elif res[k] == 1:
singles.append(list(k))
return singles, no_singles
def count_w_repetitions(multi):
from collections import defaultdict
res = defaultdict(int)
for vect in multi:
res[tuple(vect)] += 1
singles = []
no_singles = []
for k in res:
if res[k] == 1:
singles.append(list(k))
else:
for i in xrange(res[k]):
no_singles.append(list(k))
return singles, no_singles
from itertools import compress,imap
def has_all_unique(a):
return len(a) == len(frozenset(a))
uniq = map( has_all_unique,a)
singles = list(compress(a,uniq))
notuniq = imap(lambda x: not x,uniq)
nonsingles = list(compress(a,notuniq))

Categories