What I thought would be a fairly simple OCR task is turning out to be more challenging.
Here is the original image, which is a grayscale table of numbers that is tab-delimited (most likely):
And here is my attempt:
from pathlib import Path
from PIL import Image
import pytesseract
import cv2

image_path = Path("table.png")
img = cv2.imread(str(image_path))
img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)               # convert to grayscale
cv2.imwrite(str(Path("removed_noise1.png")), img)
_, img = cv2.threshold(img, 220, 255, cv2.THRESH_BINARY)  # hard threshold at 220
cv2.imwrite(str(Path("thresh1.png")), img)
img = cv2.resize(img, (2500, 1200))                       # force a fixed output size
cv2.imwrite(str(Path("resize1.png")), img)
custom_config = r"--psm 6 -c tessedit_char_whitelist=0123456789"
text = pytesseract.image_to_string(Image.open(str(Path("resize1.png"))), config=custom_config)
print(text)
>>>
18 25 8 19 6 7 5 11 2 1 0 0 0 1 2 2 1 3 4 3
58 37 45 5942 4441 50 25 2 3 4 1 1 2 2 1 3 4 3
20 15 32 18 33 32 34 26 31 6 14 13 7 2 2 2 3 3 4 3
4 3 11 3 13 12 13 9 20 9 17 17 12 5 3 4 6 3 4 4
0 0 3 1 4 4 4 3 11 9 15 15 13 8 5 5 7 3 4 4
0 1 0 1 1 2 1 5 9 12 12 13 9 6 6 8 4 4 4
0 0 0 0 1 0 1 0 3 8 9 9 11 9 7 6 8 4 4 5
0 0 0 0 0 0 0 0 2 7 7 7 9 9 7 7 8 5 4 5
0 0 0 0 0 0 0 1 6 5 5 7 8 7 7 8 5 4 5
0 0 0 0 0 0 0 0 0 6 4 4 6 7 7 6 7 5 4 5
0 0 0 0 0 0 1 0 0 5 3 3 5 7 6 6 6 5 4 5
0 0 0 0 0 0 0 0 0 5 2 2 4 6 6 6 6 5 5 5
10 0 0 0 0 0 0 0 4 2 2 3 5 5 3 5 5 5 5
0 0 0 0 0 0 0 0 0 4 1 2 2 4 5 5 4 5 5 5
0 0 0 0 0 0 0 0 0 3 1 2 2 4 5 4 5 5 5
0 0 0 0 0 10 0 0 3 1 1 1 3 4 4 3 5 4 4
0 0 0 0 0 0 0 0 0 3 1 1 3 4 4 3 5 4 4
0 0 0 0 0 0 0 0 0 2 1 1 1 2 3 4 3 4 4 4
0 0 0 0 0 0 10 0 0 2 0 0 1 2 3 3 2 4 4 4
0 0 0 0 0 0 0 0 0 2 0 0 1 2 3 3 2 4 4 4
0 0 0 0 0 0 0 0 0 2 0 0 0 1 3 3 2 4 4 4
0 0 0 0 0 0 10 0 0 1 0 0 0 1 2 2 1 4 4 4
0 0 0 0 0 0 0 0 0 1 0 0 0 1 2 2 1 4 4 3
0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 3 4 3
A lot of zeros are being read as tens, and, for example, the number in the second row, second column (57) comes out as 37.
I've tried different resize dimensions to no avail, but I obviously haven't exhausted all the possibilities.
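One variation I haven't tried yet is sketched below (untested; the integer scale factor and the switch to Otsu thresholding are guesses): upscale by a fixed factor so the digits keep their aspect ratio, and let Otsu pick the threshold instead of hard-coding 220.

import cv2
import pytesseract

img = cv2.imread("table.png", cv2.IMREAD_GRAYSCALE)
# upscale by an integer factor instead of forcing a fixed (2500, 1200) size
img = cv2.resize(img, None, fx=3, fy=3, interpolation=cv2.INTER_CUBIC)
# let Otsu choose the binarization threshold
_, img = cv2.threshold(img, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
custom_config = r"--psm 6 -c tessedit_char_whitelist=0123456789"
print(pytesseract.image_to_string(img, config=custom_config))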
Say I had a dataframe column of ones and zeros, and I wanted to group by the clusters where the value is 1. Using groupby would ordinarily yield two groups: a single group of zeros and a single group of ones.
import pandas as pd

df = pd.DataFrame([1,1,1,0,0,0,0,1,1,0,0,0,1,0,1,1,1], columns=['clusters'])
print(df)
clusters
0 1
1 1
2 1
3 0
4 0
5 0
6 0
7 1
8 1
9 0
10 0
11 0
12 1
13 0
14 1
15 1
16 1
for k, g in df.groupby(by=df.clusters):
    print(k, g)
0 clusters
3 0
4 0
5 0
6 0
9 0
10 0
11 0
13 0
1 clusters
0 1
1 1
2 1
7 1
8 1
12 1
14 1
15 1
16 1
So, in effect, I need a new column with a unique identifier for each cluster of 1s, so that we end up with:
clusters unique
0 1 1
1 1 1
2 1 1
3 0 0
4 0 0
5 0 0
6 0 0
7 1 2
8 1 2
9 0 0
10 0 0
11 0 0
12 1 3
13 0 0
14 1 4
15 1 4
16 1 4
Any help welcome. Thanks.
Let's use ngroup:
m = df['clusters'].eq(0)
df['unique'] = df.groupby(m.cumsum()[~m]).ngroup() + 1
clusters unique
0 1 1
1 1 1
2 1 1
3 0 0
4 0 0
5 0 0
6 0 0
7 1 2
8 1 2
9 0 0
10 0 0
11 0 0
12 1 3
13 0 0
14 1 4
15 1 4
16 1 4
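For reference, here is an annotated version of the same approach (my reading of why the one-liner works):

import pandas as pd

df = pd.DataFrame([1,1,1,0,0,0,0,1,1,0,0,0,1,0,1,1,1], columns=['clusters'])

m = df['clusters'].eq(0)   # True on the zero rows
key = m.cumsum()[~m]       # grouping key defined only on the 1-rows:
                           # "zeros seen so far" is constant within each run of 1s
df['unique'] = df.groupby(key).ngroup() + 1
# rows missing from `key` (the zero rows) fall outside every group,
# so ngroup() gives them -1 and the "+ 1" turns that into 0
print(df)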
Using a mask:
m = df['clusters'].eq(0)
df['unique'] = m.ne(m.shift()).mask(m, False).cumsum().mask(m, 0)
output:
clusters unique
0 1 1
1 1 1
2 1 1
3 0 0
4 0 0
5 0 0
6 0 0
7 1 2
8 1 2
9 0 0
10 0 0
11 0 0
12 1 3
13 0 0
14 1 4
15 1 4
16 1 4
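For reference, here is the same mask expression unpacked step by step (an annotated equivalent of the one-liner above):

import pandas as pd

df = pd.DataFrame([1,1,1,0,0,0,0,1,1,0,0,0,1,0,1,1,1], columns=['clusters'])

m = df['clusters'].eq(0)        # True on the zero rows
starts = m.ne(m.shift())        # True at every run boundary (0->1 or 1->0)
starts = starts.mask(m, False)  # keep only the boundaries that start a run of 1s
df['unique'] = starts.cumsum().mask(m, 0)  # number the 1-runs, zero out the 0-rows
print(df)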
I want to read the volume values from an MP3 file (filename.mp3) directly, rather than by recording audio as in this example:
import sounddevice as sd
import numpy as np

def print_sound(indata, outdata, frames, time, status):
    volume_norm = np.linalg.norm(indata) * 10
    print(int(volume_norm))

with sd.Stream(callback=print_sound):
    sd.sleep(10000)
Output (one value per callback call):
1 1 1 0 1 1 1 1 0 0 0 0 0 17 24
8 5 15 18 16 6 2 3 5 10 8 5 1 0 0
2 4 3 1 0 0 0 1 3 4 2 0 0 2 2
4 4 3 0 0 2 2 5 3 0 0 0 0 3 3
1 0 0 0 0 0 1 1 1 1 1 0 0 0 0
0 0 0 1 2 2 2 2 2 2 3 4 3 3 7
13 4 4 3 5 6 3 2 3 3 4 6 6 6 4
3 3 2 3 6 6 8 12 15 1 0 0 1 12 19
2 4 3 6 1 0 0 0 0 0 0 0 0 0 0
0 0 0 0 1 0 0 0 0 0 0 0 5 0 0
2 3 0 0 0 0 0 5 5 17 4 6 3 4 5
16 10 7 31 5 1 0 0 0 0 3 3 1 0 0
0 0 0 0 0 0 0 0 1 7 0 2 5 20 5
6 5 29 12 4 7 2 0 1 5 13 51 5 9 44
7 3 3 4 4 4 1 1 1 1 110 71 0 0 48
23 28 4 0 0 0 0 0 74 53 37 29 26 15 17
14 7 5 5 6 6 6 6 7 7 7 7 7 7 7
7 8 8 8 7 7 6 6 6 6 6 6 4 53 47
18 13 9 8 8 7 5 4 4 4 4 5 6 6 6
5 4 3 3 3 2 3 2 3 3 3 3 3 3 4
4 4 5 5 5 6 7 7 8 7 18 8 2 2 4
Try this. First install the package:
pip install ffmpegio
Then the following code should do what you want:
import ffmpegio
import numpy as np

mp3file = "filename.mp3"   # path to the MP3 file
frames = 1024              # block size (samples read per iteration); any reasonable value works

with ffmpegio.open(mp3file, 'ra', blocksize=frames, sample_fmt='dbl') as file:
    for i, indata in enumerate(file):
        volume_norm = np.linalg.norm(indata) * 10
        n0 = i * frames  # starting sample index of this block
        t = np.arange(n0, n0 + indata.shape[0]) / file.sample_rate  # timestamps of the block, if needed
        print(int(volume_norm))
The sample_fmt='dbl' argument makes indata a floating-point (double) array. If you want to keep the original sample type, remove that argument.
I'm the developer of the ffmpegio library. Let me know if you run into any issues, and I'll fix them ASAP.
I'm having trouble randomly splitting DataFrame df into groups of smaller DataFrames.
df
movie_id 1 2 4 5 6 7 8 9 10 11 12 borda
0 1 5 4 0 4 4 0 0 0 4 0 0 21
1 2 3 0 0 3 0 0 0 0 0 0 0 6
2 3 4 0 0 0 0 0 0 0 0 0 0 4
3 4 3 0 0 0 0 5 0 0 4 0 5 17
4 5 3 0 0 0 0 0 0 0 0 0 0 3
5 6 5 0 0 0 0 0 0 5 0 0 0 10
6 7 4 0 0 0 2 5 3 4 4 0 0 22
7 8 1 0 0 0 4 5 0 0 0 4 0 14
8 9 5 0 0 0 4 5 0 0 4 5 0 23
9 10 3 2 0 0 0 4 0 0 0 0 0 9
10 11 2 0 4 0 0 3 3 0 4 2 0 18
11 12 5 0 0 0 4 5 0 0 5 2 0 21
12 13 5 4 0 0 2 0 0 0 3 0 0 14
13 14 5 4 0 0 5 0 0 0 0 0 0 14
14 15 5 0 0 0 3 0 0 0 0 5 5 18
15 16 5 0 0 0 0 0 0 0 4 0 0 9
16 17 3 0 0 4 0 0 0 0 0 0 0 7
17 18 4 0 0 0 0 0 0 0 0 0 0 4
18 19 5 3 0 0 4 0 0 0 0 0 0 12
19 20 4 0 0 0 0 0 0 0 0 0 0 4
20 21 1 0 0 3 3 0 0 0 0 0 0 7
21 22 4 0 0 0 3 5 5 0 5 4 0 26
22 23 4 0 0 0 4 3 0 0 5 0 0 16
23 24 3 0 0 4 0 0 0 0 0 3 0 10
I've tried sample and arange, but with bad results.
ran1 = df.sample(frac=0.2, replace=False, random_state=1)
ran2 = df.sample(frac=0.2, replace=False, random_state=1)
ran3 = df.sample(frac=0.2, replace=False, random_state=1)
ran4 = df.sample(frac=0.2, replace=False, random_state=1)
ran5 = df.sample(frac=0.2, replace=False, random_state=1)
print(ran1, '\n')
print(ran2, '\n')
print(ran3, '\n')
print(ran4, '\n')
print(ran5, '\n')
This turned out to be five identical DataFrames, which makes sense in hindsight: every call uses the same random_state=1.
movie_id 1 2 4 5 6 7 8 9 10 11 12 borda
13 14 5 4 0 0 5 0 0 0 0 0 0 14
18 19 5 3 0 0 4 0 0 0 0 0 0 12
3 4 3 0 0 0 0 5 0 0 4 0 5 17
14 15 5 0 0 0 3 0 0 0 0 5 5 18
20 21 1 0 0 3 3 0 0 0 0 0 0 7
I've also tried:
g = df.groupby(['movie_id'])
h = np.arange(g.ngroups)
np.random.shuffle(h)
df[g.ngroup().isin(h[:6])]
The output:
movie_id 1 2 4 5 6 7 8 9 10 11 12 borda
4 5 3 0 0 0 0 0 0 0 0 0 0 3
6 7 4 0 0 0 2 5 3 4 4 0 0 22
7 8 1 0 0 0 4 5 0 0 0 4 0 14
16 17 3 0 0 4 0 0 0 0 0 0 0 7
17 18 4 0 0 0 0 0 0 0 0 0 0 4
18 19 5 3 0 0 4 0 0 0 0 0 0 12
But this still produces only one smaller group; the rest of the rows in df aren't grouped at all.
I expect the smaller groups to be split evenly by percentage, with the whole of df divided among them.
Use np.array_split
shuffled = df.sample(frac=1)
result = np.array_split(shuffled, 5)
df.sample(frac=1) shuffles the rows of df; np.array_split then splits the shuffled frame into parts of (nearly) equal size.
It gives you:
for part in result:
    print(part, '\n')
movie_id 1 2 4 5 6 7 8 9 10 11 12 borda
5 6 5 0 0 0 0 0 0 5 0 0 0 10
4 5 3 0 0 0 0 0 0 0 0 0 0 3
7 8 1 0 0 0 4 5 0 0 0 4 0 14
16 17 3 0 0 4 0 0 0 0 0 0 0 7
22 23 4 0 0 0 4 3 0 0 5 0 0 16
movie_id 1 2 4 5 6 7 8 9 10 11 12 borda
13 14 5 4 0 0 5 0 0 0 0 0 0 14
14 15 5 0 0 0 3 0 0 0 0 5 5 18
21 22 4 0 0 0 3 5 5 0 5 4 0 26
1 2 3 0 0 3 0 0 0 0 0 0 0 6
20 21 1 0 0 3 3 0 0 0 0 0 0 7
movie_id 1 2 4 5 6 7 8 9 10 11 12 borda
10 11 2 0 4 0 0 3 3 0 4 2 0 18
9 10 3 2 0 0 0 4 0 0 0 0 0 9
11 12 5 0 0 0 4 5 0 0 5 2 0 21
8 9 5 0 0 0 4 5 0 0 4 5 0 23
12 13 5 4 0 0 2 0 0 0 3 0 0 14
movie_id 1 2 4 5 6 7 8 9 10 11 12 borda
18 19 5 3 0 0 4 0 0 0 0 0 0 12
3 4 3 0 0 0 0 5 0 0 4 0 5 17
0 1 5 4 0 4 4 0 0 0 4 0 0 21
23 24 3 0 0 4 0 0 0 0 0 3 0 10
6 7 4 0 0 0 2 5 3 4 4 0 0 22
movie_id 1 2 4 5 6 7 8 9 10 11 12 borda
17 18 4 0 0 0 0 0 0 0 0 0 0 4
2 3 4 0 0 0 0 0 0 0 0 0 0 4
15 16 5 0 0 0 0 0 0 0 4 0 0 9
19 20 4 0 0 0 0 0 0 0 0 0 0 4
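As an aside, the original problem (five identical samples) came from reusing the same random_state in every call; if you want splits that are reproducible but still random, seed the single shuffle instead (the seed value below is arbitrary):

shuffled = df.sample(frac=1, random_state=42)  # one seeded shuffle
result = np.array_split(shuffled, 5)           # then split into 5 parts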
A simple demo:
df = pd.DataFrame({"movie_id": np.arange(1, 25),
"borda": np.random.randint(1, 25, size=(24,))})
n_split = 5
# the indices used to select parts from dataframe
ixs = np.arange(df.shape[0])
np.random.shuffle(ixs)
# np.split cannot handle an unequal division,
# so we need to work out the split points ourselves:
# we need (n_split - 1) split points
split_points = [i*df.shape[0]//n_split for i in range(1, n_split)]
# use these indices to select the part we want
for ix in np.split(ixs, split_points):
    print(df.iloc[ix])
The result:
borda movie_id
8 3 9
10 2 11
22 14 23
7 14 8
borda movie_id
0 16 1
20 4 21
17 15 18
15 1 16
6 6 7
borda movie_id
9 9 10
19 4 20
5 1 6
16 23 17
21 20 22
borda movie_id
11 24 12
23 5 24
1 22 2
12 7 13
18 15 19
borda movie_id
3 11 4
14 10 15
2 6 3
4 7 5
13 21 14
IIUC, you can do this:
frames = {}
for e, i in enumerate(np.split(df, 6)):
    frames.update([('df_' + str(e + 1), pd.DataFrame(np.random.permutation(i), columns=df.columns))])
print(frames['df_1'])
movie_id 1 2 4 5 6 7 8 9 10 11 12 borda
0 4 3 0 0 0 0 5 0 0 4 0 5 17
1 3 4 0 0 0 0 0 0 0 0 0 0 4
2 2 3 0 0 3 0 0 0 0 0 0 0 6
3 1 5 4 0 4 4 0 0 0 4 0 0 21
Explanation: np.split(df, 6) splits df into 6 equal-sized parts.
pd.DataFrame(np.random.permutation(i), columns=df.columns) randomly permutes the rows of each part and builds a DataFrame from the result, which is stored in a dictionary named frames.
Finally, print the dictionary by indexing it with each key; the value returned is a DataFrame. You can try frames['df_1'], frames['df_2'], etc. Each one is a random permutation of one split of the original dataframe.
Simple enough but the formatting of the data is giving me a headache:
I have the sales data of items for each day of the week. Each row is a different item, and the number corresponds to how many is sold in that day.
I want to make a mosaic plot to see the distribution of item sales across days (so seven 'columns', one for each day). I'm using statsmodels' mosaic.
How should I reformat this? It's already a dataframe, and I'm using the mosaic(...) function.
0 Fri Mon Sat Sun Thu Tue Wed
0 2 3 3 1 2 3 4
1 2 2 0 1 1 3 1
2 0 0 1 0 2 0 2
3 4 1 3 0 6 1 4
4 0 1 0 0 1 6 0
5 0 0 0 0 0 0 0
6 14 8 13 9 14 25 40
7 3 11 4 4 5 7 7
8 16 24 20 18 22 32 41
9 0 0 0 0 0 0 0
10 0 1 8 6 1 0 8
11 0 1 2 2 3 4 3
12 10 5 3 7 11 13 22
13 3 10 2 5 6 6 15
14 5 4 7 7 12 10 17
15 0 6 0 2 1 3 3
16 1 0 0 1 8 2 6
17 3 6 7 4 7 15 24
18 0 0 0 1 1 0 3
19 0 0 0 0 0 1 0
20 0 1 0 0 0 0 0
21 6 3 2 4 10 19 14
22 2 5 4 2 4 12 9
23 7 6 5 8 12 9 11
24 0 2 0 0 0 2 2
25 0 2 0 0 1 3 1
26 0 2 0 0 0 1 0
27 0 0 0 0 0 3 0
28 5 0 2 3 1 2 5
29 0 0 0 0 0 0 1
30 0 0 0 0 0 0 1
31 4 4 2 0 5 7 4
32 2 3 4 1 7 3 5
33 1 1 1 0 4 2 2
34 0 0 0 0 0 0 1
35 1 3 3 3 8 6 13
36 1 2 0 0 2 6 6
37 2 5 0 3 0 7 2
38 0 0 0 0 1 0 0
39 1 2 2 0 0 4 3
40 1 4 0 3 1 8 5
41 2 1 2 1 2 1 3
42 0 0 0 0 0 0 0
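For what it's worth, here is the direction I'm considering (a rough sketch, untested; it assumes the table above is already loaded as df): sum each day's column to get a category-to-count mapping, which mosaic() can draw directly.

import matplotlib.pyplot as plt
from statsmodels.graphics.mosaicplot import mosaic

# df is the wide table above: one row per item, one column per day of the week
day_totals = df[['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']].sum()

# mosaic() accepts a dict mapping categories to counts, so the column totals
# give a one-dimensional mosaic of sales across the days
mosaic(day_totals.to_dict(), title='Item sales by day')
plt.show()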
Problem Summary
In the following example, my NMT model has high loss because it predicts target_input almost perfectly instead of target_output.
Targetin : 1 3 3 3 3 6 6 6 9 7 7 7 4 4 4 4 4 9 9 10 10 10 3 3 10 10 3 10 3 3 10 10 3 9 9 4 4 4 4 4 3 10 3 3 9 9 3 6 6 6 6 6 6 10 9 9 10 10 4 4 4 4 4 4 4 4 4 4 4 4 9 9 9 9 3 3 3 6 6 6 6 6 9 9 10 3 4 4 4 4 4 4 4 4 4 4 4 4 9 9 10 3 10 9 9 3 4 4 4 4 4 4 4 4 4 10 10 4 4 4 4 4 4 4 4 4 4 9 9 10 3 6 6 6 6 3 3 3 10 3 3 3 4 4 4 4 4 4 4 4 4 4 4 4 4 9 9 3 3 10 6 6 6 6 6 3 9 9 3 3 3 3 3 3 3 10 10 3 9 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 9 3 6 6 6 6 6 6 3 5 3 3 3 3 10 10 10 3 9 9 5 10 3 3 3 3 9 9 9 5 10 10 10 10 10 4 4 4 4 3 10 6 6 6 6 6 6 3 5 10 10 10 10 3 9 9 6 6 6 6 6 6 6 6 6 9 9 9 3 3 3 6 6 6 6 6 6 6 6 3 9 9 9 3 3 6 6 6 3 3 3 3 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Targetout : 3 3 3 3 6 6 6 9 7 7 7 4 4 4 4 4 9 9 10 10 10 3 3 10 10 3 10 3 3 10 10 3 9 9 4 4 4 4 4 3 10 3 3 9 9 3 6 6 6 6 6 6 10 9 9 10 10 4 4 4 4 4 4 4 4 4 4 4 4 9 9 9 9 3 3 3 6 6 6 6 6 9 9 10 3 4 4 4 4 4 4 4 4 4 4 4 4 9 9 10 3 10 9 9 3 4 4 4 4 4 4 4 4 4 10 10 4 4 4 4 4 4 4 4 4 4 9 9 10 3 6 6 6 6 3 3 3 10 3 3 3 4 4 4 4 4 4 4 4 4 4 4 4 4 9 9 3 3 10 6 6 6 6 6 3 9 9 3 3 3 3 3 3 3 10 10 3 9 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 9 3 6 6 6 6 6 6 3 5 3 3 3 3 10 10 10 3 9 9 5 10 3 3 3 3 9 9 9 5 10 10 10 10 10 4 4 4 4 3 10 6 6 6 6 6 6 3 5 10 10 10 10 3 9 9 6 6 6 6 6 6 6 6 6 9 9 9 3 3 3 6 6 6 6 6 6 6 6 3 9 9 9 3 3 6 6 6 3 3 3 3 3 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Prediction : 3 3 3 3 3 6 6 6 9 7 7 7 4 4 4 4 4 9 3 3 3 3 3 3 10 3 3 10 3 3 10 3 3 9 3 4 4 4 4 4 3 10 3 3 9 3 3 6 6 6 6 6 6 10 9 3 3 3 4 4 4 4 4 4 4 4 4 4 4 4 9 3 3 3 3 3 3 6 6 6 6 6 9 6 3 3 4 4 4 4 4 4 4 4 4 4 4 4 9 3 3 3 10 9 3 3 4 4 4 4 4 4 4 4 4 3 10 4 4 4 4 4 4 4 4 4 4 9 3 3 3 6 6 6 6 3 3 3 10 3 3 3 4 4 4 4 4 4 4 4 4 4 4 4 4 9 3 3 3 10 6 6 6 6 6 3 9 3 3 3 3 3 3 3 3 3 3 3 9 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 9 3 6 6 6 6 6 6 3 3 3 3 3 3 10 3 3 3 9 3 3 10 3 3 3 3 9 3 9 3 10 3 3 3 3 4 4 4 4 3 10 6 6 6 6 6 6 3 3 10 3 3 3 3 9 3 6 6 6 6 6 6 6 6 6 9 6 9 3 3 3 6 6 6 6 6 6 6 6 3 9 3 9 3 3 6 6 6 3 3 3 3 3 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6
Source : 9 16 4 7 22 22 19 1 12 19 12 18 5 18 9 18 5 8 12 19 19 5 5 19 22 7 12 12 6 19 7 3 20 7 9 14 4 11 20 12 7 1 18 7 7 5 22 9 13 22 20 19 7 19 7 13 7 11 19 20 6 22 18 17 17 1 12 17 23 7 20 1 13 7 11 11 22 7 12 1 13 12 5 5 19 22 5 5 20 1 5 4 12 9 7 12 8 14 18 22 18 12 18 17 19 4 19 12 11 18 5 9 9 5 14 7 11 6 4 17 23 6 4 5 12 6 7 14 4 20 6 8 12 25 4 19 6 1 5 1 5 20 4 18 12 12 1 11 12 1 25 13 18 19 7 12 7 3 4 22 9 9 12 4 8 9 19 9 22 22 19 1 19 7 5 19 4 5 18 11 13 9 4 14 12 13 20 11 12 11 7 6 1 11 19 20 7 22 22 12 22 22 9 3 8 12 11 14 16 4 11 7 11 1 8 5 5 7 18 16 22 19 9 20 4 12 18 7 19 7 1 12 18 17 12 19 4 20 9 9 1 12 5 18 14 17 17 7 4 13 16 14 12 22 12 22 18 9 12 11 3 18 6 20 7 4 20 7 9 1 7 25 13 5 25 14 11 5 20 7 23 12 5 16 19 19 25 19 7 -1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
As is evident, the prediction matches target_input almost exactly rather than target_output, which it should match (the two differ only by a one-step shift).
Model Overview
An NMT model predicts a sequence of words in a target language from a sequence of words in a source language. This is the framework behind Google Translate. Since NMT uses coupled RNNs, it is supervised and requires labelled target input and output.
NMT uses a source sequence, a target_input sequence, and a target_output sequence. The encoder RNN consumes the source words to produce a meaning vector, which it passes to the decoder RNN; the decoder uses the meaning vector to produce the output sequence.
When making new predictions (inference), the decoder RNN seeds each timestep with its own previous output. During training, however, it is fed the correct previous token at each timestep (teacher forcing), which speeds up training. This is why target_input is needed for training.
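To make the off-by-one relationship concrete, here is a made-up toy example (SOS and EOS stand for hparams.sos and hparams.eos):

target        = [3, 7, 9]
target_input  = [SOS, 3, 7, 9]   # decoder input: the correct previous tokens (teacher forcing)
target_output = [3, 7, 9, EOS]   # what the decoder should predict at each timestep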
Code to get an iterator with source, target_in, target_out
def get_batched_iterator(hparams, src_loc, tgt_loc):
    if not (os.path.exists('primary.csv') and os.path.exists('secondary.csv')):
        utils.integerize_raw_data()
    source_dataset = tf.data.TextLineDataset(src_loc)
    target_dataset = tf.data.TextLineDataset(tgt_loc)
    dataset = tf.data.Dataset.zip((source_dataset, target_dataset))
    dataset = dataset.shuffle(hparams.shuffle_buffer_size, seed=hparams.shuffle_seed)
    dataset = dataset.map(lambda source, target: (tf.string_to_number(tf.string_split([source], delimiter=',').values, tf.int32),
                                                  tf.string_to_number(tf.string_split([target], delimiter=',').values, tf.int32)))
    dataset = dataset.map(lambda source, target: (source, tf.concat(([hparams.sos], target), axis=0), tf.concat((target, [hparams.eos]), axis=0)))
    dataset = dataset.map(lambda source, target_in, target_out: (source, target_in, target_out, tf.size(source), tf.size(target_in)))
    # Proceed to batch and return iterator
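    # (sketch of one possible way to finish this function, not the original code;
    #  the padded_batch arguments are assumptions chosen to match the 0-padded
    #  sequences shown in the Problem Summary)
    dataset = dataset.padded_batch(hparams.batch_size,
                                   padded_shapes=([None], [None], [None], [], []))
    iterator = dataset.make_initializable_iterator()
    return iterator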
NMT model core code
def __init__(self, hparams, iterator, mode):
    source, target_in, target_out, source_lengths, target_lengths = iterator.get_next()

    # Lookup embeddings
    embedding_encoder = tf.get_variable("embedding_encoder", [hparams.src_vsize, hparams.src_emsize])
    encoder_emb_inp = tf.nn.embedding_lookup(embedding_encoder, source)
    embedding_decoder = tf.get_variable("embedding_decoder", [hparams.tgt_vsize, hparams.tgt_emsize])
    decoder_emb_inp = tf.nn.embedding_lookup(embedding_decoder, target_in)

    # Build and run Encoder LSTM
    encoder_cell = tf.nn.rnn_cell.BasicLSTMCell(hparams.num_units)
    encoder_outputs, encoder_state = tf.nn.dynamic_rnn(encoder_cell, encoder_emb_inp, sequence_length=source_lengths, dtype=tf.float32)

    # Build and run Decoder LSTM with TrainingHelper and output projection layer
    decoder_cell = tf.nn.rnn_cell.BasicLSTMCell(hparams.num_units)
    projection_layer = layers_core.Dense(hparams.tgt_vsize, use_bias=False)
    helper = tf.contrib.seq2seq.TrainingHelper(decoder_emb_inp, sequence_length=target_lengths)
    decoder = tf.contrib.seq2seq.BasicDecoder(decoder_cell, helper, encoder_state, output_layer=projection_layer)
    outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(decoder)
    logits = outputs.rnn_output

    if mode == 'TRAIN' or mode == 'EVAL':  # then calculate loss
        crossent = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=target_out, logits=logits)
        target_weights = tf.sequence_mask(target_lengths, maxlen=tf.shape(target_out)[1], dtype=logits.dtype)
        self.loss = tf.reduce_sum((crossent * target_weights) / hparams.batch_size)

    if mode == 'TRAIN':  # then calculate/clip gradients, then optimize model
        params = tf.trainable_variables()
        gradients = tf.gradients(self.loss, params)
        clipped_gradients, _ = tf.clip_by_global_norm(gradients, hparams.max_gradient_norm)
        optimizer = tf.train.AdamOptimizer(hparams.l_rate)
        self.update_step = optimizer.apply_gradients(zip(clipped_gradients, params))

    if mode == 'EVAL':  # then allow access to input/output tensors for printout
        self.src = source
        self.tgt_in = target_in
        self.tgt_out = target_out
        self.logits = logits
The core issue is that when the NMT model is used to predict a language-like syntax with a repetitive structure, it becomes incentivized to simply repeat its previous prediction. Because TrainingHelper feeds it the correct previous token at each step to speed up training, this creates an artificial local minimum that the model is unable to escape.
The best option I have found is to weight the loss function so that the key points in the output sequence, where the output is not repetitive, are weighted more heavily. This incentivizes the model to get those positions right instead of just repeating the previous prediction.
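Here is a minimal sketch of that weighting idea (illustrative only; the variable names follow the model code above, and extra_weight is an arbitrary choice):

import tensorflow as tf

def weighted_sequence_loss(crossent, target_out, target_lengths, batch_size, extra_weight=5.0):
    # padding mask, as in the model above
    target_weights = tf.sequence_mask(target_lengths, maxlen=tf.shape(target_out)[1], dtype=crossent.dtype)
    # mark positions where the target token differs from the previous one,
    # i.e. the non-repetitive "key points" in the output sequence
    prev = tf.concat([target_out[:, :1], target_out[:, :-1]], axis=1)
    changes = tf.cast(tf.not_equal(target_out, prev), crossent.dtype)
    # up-weight those positions so that merely repeating the previous token costs more
    weights = target_weights * (1.0 + extra_weight * changes)
    return tf.reduce_sum(crossent * weights) / batch_size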