How to interpolate a 2D curve in Python - python

I have a set of x & y coordinates which form a curve / shape, and I want to smooth the curve / shape and plot a graph.
I tried different interpolation methods to smooth the curve / shape, but none fit my expectation: I want to use the points to draw a smooth curve / shape.
Like the following, using x, y point to get a smooth circle / curve
However, I get something like
circle.jpg
curve.jpg
square.jpg
I also get trouble on spline interpolation, and rbf interpolation.
for cubic_spline_interpolation, I got
ValueError: Error on input data
for univariate_spline_interpolated, I got
ValueError: x must be strictly increasing
for rbf, I got
numpy.linalg.linalg.LinAlgError: Matrix is singular.
I have no idea how to fix these errors and get the correct shape and curve. Many thanks for any help.
Edit
For those cannot download the source code and x, y coordinate file, I post the code and x, y coordinate in question.
The following is my code:
#!/usr/bin/env python3
from std_lib import *
import os
import numpy as np
import cv2
from scipy import interpolate
import matplotlib.pyplot as plt
# Input data files: one "x, y" integer pair per line, loaded from CUR_DIR.
CUR_DIR = os.getcwd()
CIRCLE_FILE = "circle.txt"
CURVE_FILE = "curve.txt"
SQUARE_FILE = "square.txt"
#test
CIRCLE_NAME = "circle"
CURVE_NAME = "curve"
SQUARE_NAME = "square"
SYS_TOKEN_CNT = 2 # x, y
# Module-level state filled in by convert_coord_to_array() and read by the
# interpolation plotting helpers below.
total_pt_cnt = 0 # total no. of points
x_arr = np.array([]) # x position set
y_arr = np.array([]) # y position set
def convert_coord_to_array(file_path):
    """Load a file of "x, y" integer pairs into the module globals.

    Fills x_arr, y_arr (float arrays) and total_pt_cnt.

    Returns:
        True on success, False on an empty path, empty file, or any
        malformed line.
    """
    global total_pt_cnt
    global x_arr
    global y_arr
    if file_path == "":
        return False
    with open(file_path) as f:
        content = [line.strip() for line in f.readlines()]
    total_pt_cnt = len(content)
    if total_pt_cnt <= 0:
        return False
    # Collect into plain lists first: repeated np.append() is O(n^2).
    xs = []
    ys = []
    for line in content:
        tokens = [tok.strip() for tok in line.split(',')]
        if len(tokens) != SYS_TOKEN_CNT:
            return False
        # int() handles optional sign, unlike the previous isdigit() check.
        try:
            x_val = int(tokens[0])
            y_val = int(tokens[1])
        except ValueError:
            return False
        xs.append(x_val)
        ys.append(y_val)
    # float dtype matches what the original np.append-based code produced.
    x_arr = np.asarray(xs, dtype=float)
    y_arr = np.asarray(ys, dtype=float)
    return True
def linear_interpolation(fig, axs):
    """Plot a piecewise-linear y = f(x) fit of the global point set on axs.

    NOTE(review): a single-valued y = f(x) cannot represent a closed
    curve such as the circle data; parametric_spline_interpolation is
    the appropriate tool for that case.
    """
    # Fix: build the interpolant from the actual data (x_arr, y_arr).
    # The original passed the evenly spaced xnew grid as the abscissa,
    # pairing y values with fabricated x positions.
    # np.unique sorts and drops duplicate x, which interp1d cannot handle.
    xs, idx = np.unique(x_arr, return_index=True)
    ys = y_arr[idx]
    f = interpolate.interp1d(xs, ys)
    xnew = np.linspace(xs.min(), xs.max(), len(x_arr))
    axs.plot(xnew, f(xnew))
    axs.set_title('linear')
def cubic_interpolation(fig, axs):
    """Plot a cubic y = f(x) interpolation of the global point set on axs.

    NOTE(review): y = f(x) is ill-posed for closed curves; see
    parametric_spline_interpolation for the parametric alternative.
    """
    # Fix: interpolate the real data, not the synthetic xnew grid the
    # original used as abscissa. Cubic interp1d requires strictly
    # increasing x, so sort and deduplicate with np.unique first.
    xs, idx = np.unique(x_arr, return_index=True)
    ys = y_arr[idx]
    f = interpolate.interp1d(xs, ys, kind='cubic')
    xnew = np.linspace(xs.min(), xs.max(), len(x_arr))
    axs.plot(xnew, f(xnew))
    axs.set_title('cubic')
def cubic_spline_interpolation(fig, axs):
    """Plot a smoothing B-spline y = f(x) of the global point set on axs."""
    # splrep requires strictly increasing x. Sorting and dropping
    # duplicate abscissae (np.unique does both) is what fixes the
    # original "ValueError: Error on input data".
    xs, idx = np.unique(x_arr, return_index=True)
    ys = y_arr[idx]
    xnew = np.linspace(xs.min(), xs.max(), len(x_arr))
    tck = interpolate.splrep(xs, ys, s=0)
    ynew = interpolate.splev(xnew, tck, der=0)
    axs.plot(xnew, ynew)
    axs.set_title('cubic spline')
def parametric_spline_interpolation(fig, axs):
    """Plot a parametric spline (x(u), y(u)) through the global point set.

    This handles closed/overlapping curves (circle, square) where a
    single-valued y = f(x) does not exist.
    """
    tck, u = interpolate.splprep([x_arr, y_arr], s=0)
    # Fix: splprep parametrizes the curve by u in [0, 1], so splev must
    # be evaluated at parameter values. The original evaluated it at
    # x-coordinate values spanning [x.min(), x.max()], far outside the
    # parameter domain.
    unew = np.linspace(0, 1, len(x_arr))
    out = interpolate.splev(unew, tck)
    axs.plot(out[0], out[1])
    axs.set_title('parametric spline')
def univariate_spline_interpolated(fig, axs):
    """Plot an interpolating univariate spline y = f(x) of the global points."""
    # InterpolatedUnivariateSpline requires strictly increasing x; the
    # circle data revisits x values, causing the original
    # "ValueError: x must be strictly increasing". np.unique sorts and
    # removes the duplicate abscissae.
    xs, idx = np.unique(x_arr, return_index=True)
    ys = y_arr[idx]
    s = interpolate.InterpolatedUnivariateSpline(xs, ys)
    xnew = np.linspace(xs.min(), xs.max(), len(x_arr))
    ynew = s(xnew)
    axs.plot(xnew, ynew)
    axs.set_title('univariate spline')
def rbf(fig, axs):
    """Plot a radial-basis-function fit y = f(x) of the global point set."""
    # Duplicate x values make the RBF system matrix singular (the
    # original "LinAlgError: Matrix is singular."); keep one sample per
    # unique x before fitting.
    xs, idx = np.unique(x_arr, return_index=True)
    ys = y_arr[idx]
    xnew = np.linspace(xs.min(), xs.max(), len(x_arr))
    # Renamed local so it no longer shadows this function's own name.
    rbf_fit = interpolate.Rbf(xs, ys)
    fi = rbf_fit(xnew)
    axs.plot(xnew, fi)
    axs.set_title('rbf')
def interpolation():
    """Show the raw points alongside three interpolation variants."""
    figure, axes = plt.subplots(nrows=4)
    # Top panel: the untouched input polyline.
    axes[0].plot(x_arr, y_arr, 'r-')
    axes[0].set_title('org')
    # Remaining panels: one interpolation flavor each.
    cubic_interpolation(figure, axes[1])
    parametric_spline_interpolation(figure, axes[2])
    linear_interpolation(figure, axes[3])
    plt.show()
#------- main -------
if __name__ == "__main__":
    # np.seterr(divide='ignore', invalid='ignore')
    # Load the circle sample and draw the comparison figure.
    file_name = CUR_DIR + "/" + CIRCLE_FILE
    convert_coord_to_array(file_name)
    #file_name = CUR_DIR + "/" + CURVE_FILE
    #convert_coord_to_array(file_name)
    #file_name = CUR_DIR + "/" + SQUARE_FILE
    #convert_coord_to_array(file_name)
    #
    interpolation()
circle x, y coordinate
307, 91
308, 90
339, 90
340, 91
348, 91
349, 92
351, 92
352, 93
357, 93
358, 94
361, 94
362, 95
364, 95
365, 96
369, 96
370, 97
374, 97
375, 98
376, 98
377, 99
379, 99
380, 100
382, 100
383, 101
386, 101
387, 102
389, 102
390, 103
392, 103
393, 104
394, 104
395, 105
398, 105
399, 106
400, 106
401, 107
402, 107
403, 108
405, 108
406, 109
407, 109
408, 110
410, 110
411, 111
413, 111
414, 112
415, 112
416, 113
417, 113
418, 114
419, 114
420, 115
421, 115
422, 116
423, 116
425, 118
426, 118
428, 120
429, 120
430, 121
430, 122
431, 122
433, 124
434, 124
435, 125
435, 126
437, 128
437, 129
441, 133
441, 134
442, 135
442, 137
443, 137
444, 138
444, 140
445, 141
445, 142
446, 143
446, 146
447, 147
447, 148
448, 149
448, 153
449, 154
449, 191
448, 192
448, 223
447, 224
447, 240
446, 241
446, 242
445, 243
445, 248
444, 249
444, 253
443, 254
443, 256
442, 257
442, 259
441, 260
441, 263
440, 264
440, 267
439, 268
439, 269
438, 270
438, 272
436, 274
436, 275
435, 276
435, 279
434, 280
434, 281
433, 282
433, 283
431, 285
431, 288
429, 290
429, 291
428, 292
428, 293
426, 295
426, 296
425, 297
425, 298
424, 299
424, 300
423, 301
423, 303
422, 304
422, 305
420, 307
420, 308
419, 309
419, 310
417, 312
417, 313
415, 315
415, 316
414, 317
414, 318
412, 320
411, 320
410, 321
410, 322
409, 323
409, 324
408, 325
407, 325
402, 330
401, 330
401, 331
399, 333
398, 333
395, 336
395, 337
394, 338
393, 338
390, 341
388, 341
387, 342
387, 343
386, 344
384, 344
383, 345
382, 345
380, 347
379, 347
377, 349
376, 349
374, 351
373, 351
373, 352
372, 353
370, 353
369, 354
368, 354
367, 355
366, 355
365, 356
364, 356
363, 357
362, 357
359, 360
358, 360
357, 361
356, 361
355, 362
353, 362
353, 363
352, 364
348, 364
347, 365
314, 365
313, 364
297, 364
296, 363
284, 363
283, 362
280, 362
279, 361
273, 361
272, 360
271, 360
270, 359
265, 359
264, 358
262, 358
261, 357
260, 357
258, 355
257, 355
256, 354
255, 354
252, 351
251, 351
246, 346
245, 346
237, 338
237, 337
235, 335
234, 335
231, 332
231, 331
230, 330
230, 329
222, 321
222, 320
217, 315
217, 314
213, 310
213, 309
210, 306
210, 305
204, 299
204, 298
203, 297
203, 296
199, 292
199, 291
198, 290
198, 289
197, 289
194, 286
194, 285
191, 282
191, 280
187, 276
187, 275
185, 273
185, 271
184, 270
184, 269
183, 268
183, 266
182, 265
182, 264
180, 262
180, 261
179, 260
179, 258
177, 256
177, 254
176, 253
176, 251
175, 250
175, 249
174, 248
174, 246
173, 245
173, 243
171, 241
171, 237
170, 236
170, 232
169, 231
169, 230
168, 229
168, 211
169, 210
169, 205
170, 204
170, 199
171, 198
171, 195
172, 194
172, 193
173, 192
173, 189
174, 188
174, 185
176, 183
176, 180
177, 179
177, 177
178, 176
178, 175
179, 174
179, 173
180, 172
180, 170
182, 168
182, 167
183, 166
183, 165
185, 163
185, 162
186, 161
186, 160
189, 157
189, 156
191, 154
191, 153
192, 152
192, 149
197, 144
197, 143
203, 137
204, 137
207, 134
208, 134
211, 131
213, 131
216, 128
217, 128
218, 127
219, 127
221, 125
222, 125
223, 124
224, 124
225, 123
226, 123
227, 122
228, 122
229, 121
231, 121
233, 119
234, 119
237, 116
239, 116
240, 115
241, 115
242, 114
244, 114
245, 113
246, 113
247, 112
250, 112
251, 111
252, 111
253, 110
256, 110
257, 109
258, 109
259, 108
262, 108
263, 107
266, 107
267, 106
269, 106
272, 103
274, 103
275, 102
276, 102
277, 101
278, 101
279, 100
281, 100
282, 99
283, 99
284, 98
286, 98
287, 97
288, 97
289, 96
290, 96
291, 95
293, 95
295, 93
298, 93
299, 92
302, 92
303, 91
Solved
def linear_interpolateion(self, x, y):
    """Resample the polyline (x, y) at len(x) points evenly spaced along
    its arc length.

    Parametrizing by cumulative distance (rather than y = f(x)) makes
    this work for closed or self-overlapping curves.

    Returns:
        (out_x, out_y) arrays of the resampled coordinates.
    """
    points = np.array([x, y]).T  # a (nbre_points x nbre_dim) array
    # Cumulative arc length; prepend 0 so distance[i] is the length of
    # the curve up to point i.
    distance = np.cumsum(np.sqrt(np.sum(np.diff(points, axis=0) ** 2, axis=1)))
    distance = np.insert(distance, 0, 0)
    # Fix: do not truncate the upper end with int(); that cut the
    # resampled curve short of the final point whenever the total
    # length was not a whole number.
    alpha = np.linspace(distance.min(), distance.max(), len(x))
    interpolator = interpolate.interp1d(distance, points, kind='slinear', axis=0)
    interpolated_points = interpolator(alpha)
    out_x = interpolated_points.T[0]
    out_y = interpolated_points.T[1]
    return out_x, out_y

Because the interpolation is wanted for a generic 2D curve, i.e. (x, y) = f(s) where s is the coordinate along the curve, rather than y = f(x), the distance along the line s has to be computed first. Then, the interpolation of each coordinate is performed relative to s. (For instance, in the circle case, y = f(x) has two solutions.)
s (or distance in the code here) is calculated as the cumulative sum of the length of each segments between the given points.
import numpy as np
from scipy.interpolate import interp1d
import matplotlib.pyplot as plt

# Define some points:
points = np.array([[0, 1, 8, 2, 2],
                   [1, 0, 6, 7, 2]]).T  # a (nbre_points x nbre_dim) array

# Linear length along the line, normalized to [0, 1]:
distance = np.cumsum(np.sqrt(np.sum(np.diff(points, axis=0) ** 2, axis=1)))
distance = np.insert(distance, 0, 0) / distance[-1]

# Interpolation for different methods:
interpolations_methods = ['slinear', 'quadratic', 'cubic']
alpha = np.linspace(0, 1, 75)
interpolated_points = {}
for method in interpolations_methods:
    # Interpolate both coordinates at once (axis=0) against arc length.
    interpolator = interp1d(distance, points, kind=method, axis=0)
    interpolated_points[method] = interpolator(alpha)

# Graph:
plt.figure(figsize=(7, 7))
for method_name, curve in interpolated_points.items():
    plt.plot(*curve.T, '-', label=method_name)
plt.plot(*points.T, 'ok', label='original points')
plt.axis('equal')
plt.legend()
plt.xlabel('x')
plt.ylabel('y')
which gives:
Regarding the graphs, it seems you are looking for a smoothing method rather than an interpolation of the points. Here, is a similar approach use to fit a spline separately on each coordinates of the given curve (see Scipy UnivariateSpline):
import numpy as np
import matplotlib.pyplot as plt
from scipy.interpolate import UnivariateSpline

# Define some points on an arc of the unit circle:
theta = np.linspace(-3, 2, 40)
points = np.vstack((np.cos(theta), np.sin(theta))).T
# add some noise:
points = points + 0.05 * np.random.randn(*points.shape)

# Linear length along the line, normalized to [0, 1]:
distance = np.cumsum(np.sqrt(np.sum(np.diff(points, axis=0) ** 2, axis=1)))
distance = np.insert(distance, 0, 0) / distance[-1]

# Build a list of the spline functions, one for each dimension:
splines = [UnivariateSpline(distance, coords, k=3, s=.2) for coords in points.T]

# Compute the splines for the asked distances.
alpha = np.linspace(0, 1, 75)
# Fix: np.vstack needs a real sequence; passing a generator was
# deprecated in NumPy 1.16 and fails on current NumPy.
points_fitted = np.vstack([spl(alpha) for spl in splines]).T

# Graph:
plt.plot(*points.T, 'ok', label='original points')
plt.plot(*points_fitted.T, '-r', label='fitted spline k=3, s=.2')
plt.axis('equal')
plt.legend()
plt.xlabel('x')
plt.ylabel('y')
which gives:

Related

How to remove unwanted lines in azure(python)

/usr/local/lib/python3.8/dist-packages/attr/__init__.py 27 0 100%
/usr/local/lib/python3.8/dist-packages/attr/_cmp.py 55 45 18% 51-100, 108-114, 122-137, 144-147, 154
/usr/local/lib/python3.8/dist-packages/attr/_compat.py 96 48 50% 22-24, 28-107, 123, 132, 153-156, 175, 191-212, 234, 241-242
/usr/local/lib/python3.8/dist-packages/attr/_config.py 9 4 56% 19-22, 33
/usr/local/lib/python3.8/dist-packages/attr/_funcs.py 96 84 12% 54-116, 130-189, 225-289, 301, 323-341, 360-370, 409-422
/usr/local/lib/python3.8/dist-packages/attr/_make.py 977 346 65% 84, 87, 90, 115-116, 121, 274, 280, 285, 293, 296, 299, 351-352, 413, 431, 450, 457-481, 501-507, 529-532, 556, 581, 590-591, 602, 611, 623-634, 642, 649, 734-754, 763, 792-796, 807-810, 838-839, 847, 881, 914-915, 918, 929-939, 954, 962-971, 1011, 1064, 1069-1090, 1098-1099, 1105-1106, 1112-1113, 1130, 1134, 1145, 1156, 1163, 1170-1171, 1186, 1212-1216, 1501, 1509, 1514, 1523, 1552, 1571, 1576, 1583, 1596, 1610, 1620, 1641-1646, 1690-1698, 1722-1732, 1758-1762, 1788-1799, 1829, 1840-1843, 1849-1852, 1858-1861, 1867-1870, 1928, 1954-2015, 2047-2054, 2075-2082, 2093-2099, 2103, 2131, 2138, 2144-2147, 2149, 2200, 2213, 2224, 2235-2287, 2313, 2336, 2344, 2380, 2388-2396, 2407-2418, 2428, 2447, 2454-2469, 2488, 2544-2553, 2558-2560, 2564-2569, 2694, 2702, 2732-2734, 2748-2752, 2759, 2768, 2771-2776, 2925-2929, 2941-2946, 2981, 2987-2988, 3035-3079, 3095-3096, 3109-3117, 3135-3173
/usr/local/lib/python3.8/dist-packages/attr/_next_gen.py 37 24 35% 82-147, 175, 198, 214
/usr/local/lib/python3.8/dist-packages/attr/_version_info.py 37 17 54% 60-69, 72-77, 80-87
/usr/local/lib/python3.8/dist-packages/attr/converters.py 58 47 19% 40-62, 83-114, 143-155
/usr/local/lib/python3.8/dist-packages/attr/exceptions.py 18 4 78% 89-91, 94
/usr/local/lib/python3.8/dist-packages/attr/filters.py 16 9 44% 17, 32-37, 49-54
/usr/local/lib/python3.8/dist-packages/attr/setters.py 28 16 43% 21-26, 37, 46-55, 65-69
/usr/local/lib/python3.8/dist-packages/yaml/resolver.py 135 97 28% 22-23, 30, 33, 51-89, 92-112, 115-118, 122-141, 144-165
/usr/local/lib/python3.8/dist-packages/yaml/scanner.py 753 672 11% 39-44, 60-109, 115-123, 128-133, 137-141, 146-154, 159-258, 272-277, 286-293, 301-310, 314-321, 340-347, 351-355, 364-367, 374-388, 393-400, 403, 406, 411-422, 425, 428, 433-445, 448, 451, 456-468, 473-482, 487-515, 520-543, 548-599, 604-610, 615-621, 626-632, 635, 638, 643-649, 652, 655, 660-666, 671-679, 687-688, 693-696, 701-704, 709, 714-719, 724-729, 745-746, 772-785, 789-804, 808-825, 829-842, 846-855, 859-865, 869-874, 878-883, 887-897, 908-933, 937-974, 979-1049, 1054-1090, 1094-1104, 1108-1119, 1123-1132, 1141-1155, 1187-1226, 1230-1250, 1254-1268, 1276-1309, 1315-1346, 1352-1370, 1375-1395, 1399-1414, 1425-1435
/usr/local/lib/python3.8/dist-packages/yaml/serializer.py 85 70 18% 17-25, 28-34, 37-41, 47-58, 61-72, 75-76, 79-110
/usr/local/lib/python3.8/dist-packages/yaml/tokens.py
these lines are checking for other repos,
So how to remove all these unwanted pipelines in azure, while running the pipeline
Please provide the solution

Can a Dataframe of NBA players be sorted by various conditions: to combine the rows of players w/ multiple entries bc they played on many teams?

I want to remove any players who didn't have over 1000 MP(minutes played).
I could easily write:
# Load the season totals and drop the redundant per-player id column.
league_stats= pd.read_csv("1996.csv")
league_stats = league_stats.drop("Player-additional", axis=1)
# NOTE(review): this filters per row (per team stint), so a player whose
# minutes are split across several teams can be dropped even though his
# season total exceeds 1000 MP — exactly the problem described below.
league_stats_1000 = league_stats[league_stats['MP'] > 1000]
However, because players sometimes play for multiple teams in a year...this code doesn't account for that.
For example, Sam Cassell has four entries and none are above 1000 MP, but in total his MP for the season was over 1000. By running the above code I remove him from the new dataframe.
I am wondering if there is a way to sort the Dataframe by matching Rank(the RK column gives players who played on different teams the same rank number for each team they played on) and then sort it by... if the total of their MP is 1000=<.
This is the page I got the data from: 1996-1997 season.
Above the data table and to the left of the blue check box there is a dropdown menu called "Share and Export". From there I clicked on "Get table as CSV (for Excel)". After that I saved the CSV to a text editor and change the file extension to .csv to upload it to Jupyter Notebook.
This is a solution I came up with:
# Scrape the 1996-97 totals table; rows with Tm == "TOT" hold each
# multi-team player's season aggregate.
url = 'https://www.basketball-reference.com/leagues/NBA_1997_totals.html'
df = pd.read_html(url)[0]
tot_df = df.loc[df['Tm'] == 'TOT']
# NOTE(review): mp_1000 is computed but never used below.
mp_1000 = tot_df.loc[tot_df["MP"] < 1000]
# Create list of indexes with unnecessary entries to be removed. We have TOT and don't need these rows.
# *** For the record, I came up with this list by manually going through the data.
indexes_to_remove = [5,6,24, 25, 66, 67, 248, 249, 447, 448, 449, 275, 276, 277, 19, 20, 21, 377, 378, 477, 478, 479,
54, 55, 451, 452, 337, 338, 156, 157, 73, 74, 546, 547, 435, 436, 437, 142, 143, 421, 42, 43, 232,
233, 571, 572, 363, 364, 531, 532, 201, 202, 111, 112, 139, 140, 307, 308, 557, 558, 93, 94, 512,
513, 206, 207, 208, 250, 259, 286, 287, 367, 368, 271, 272, 102, 103, 34, 35, 457, 458, 190, 191,
372, 373, 165, 166
]
# Keep only the TOT rows by dropping each player's per-team stint rows.
df_drop_tot = df.drop(labels=indexes_to_remove, axis=0)
df_drop_tot
First off, no need to manually download the csv and then read it into pandas. You can load in the table using pandas' .read_html().
And yes, you can simply get the list of ranks, player names, or whatever, that have greater than 1000 MP, then use that list to filter the dataframe.
import pandas as pd
url = 'https://www.basketball-reference.com/leagues/NBA_1997_totals.html'
df = pd.read_html(url)[0]
# Drop the repeated header rows embedded in the table body ("Rk" == "Rk").
df = df[df['Rk'].ne('Rk')]
df['MP'] = df['MP'].astype(int)
players_1000_rk_list = list(df[df['MP'] >= 1000]['Rk']) #<- converts the "Rk" column into a list; used in the next line to keep only the "Rk" values whose rows reach >= 1000 MP
# Keep every stint row of each player whose Rk appears in that list.
players_df = df[df['Rk'].isin(players_1000_rk_list)]
Output: filters down from 574 rows to 282 rows
print(players_df)
Rk Player Pos Age Tm G ... AST STL BLK TOV PF PTS
0 1 Mahmoud Abdul-Rauf PG 27 SAC 75 ... 189 56 6 119 174 1031
1 2 Shareef Abdur-Rahim PF 20 VAN 80 ... 175 79 79 225 199 1494
3 4 Cory Alexander PG 23 SAS 80 ... 254 82 16 146 148 577
7 6 Ray Allen* SG 21 MIL 82 ... 210 75 10 149 218 1102
10 9 Greg Anderson C 32 SAS 82 ... 34 63 67 73 225 322
.. ... ... .. .. ... .. ... ... ... .. ... ... ...
581 430 Walt Williams SF 26 TOR 73 ... 197 97 62 174 282 1199
582 431 Corliss Williamson SF 23 SAC 79 ... 124 60 49 157 263 915
583 432 Kevin Willis PF 34 HOU 75 ... 71 42 32 119 216 842
589 438 Lorenzen Wright C 21 LAC 77 ... 49 48 60 79 211 561
590 439 Sharone Wright C 24 TOR 60 ... 28 15 50 93 146 390
[282 rows x 30 columns]

Cluster objects by geometric coordinates (Y axis)

I've got a pandas DataFrame with records describing rectangles with absolute coordinates of all the 4 points: TL (top-left), TR (top-right), BL (bottom-left) and BR (bottom-right). As it is, the rects seem to follow a row-like pattern, where there are conspicuous clusters forming "rows", like in this picture:
The data look like this:
tl_x tl_y tr_x tr_y br_x br_y bl_x bl_y ht wd
0 1567 136 1707 136 1707 153 1567 153 17 140
1 1360 154 1548 154 1548 175 1360 175 21 188
2 1567 154 1747 154 1747 174 1567 174 20 180
3 1311 175 1548 175 1548 196 1311 196 21 237
4 1565 174 1741 174 1741 199 1565 199 25 176
5 1566 196 1753 196 1753 220 1566 220 24 187
...
I need to cluster these objects along the bl_y or br_y column (bottom Y coordinate) to produce a 2D list of "rows" like:
As you see, objects in each "row" may have slightly varying Y coordinates (not exactly equivalent in each cluster). What I basically need is some function to add a separate e.g. clustered_y column to the DF and then sort by this column.
What's the simplest way to go?
Given the dataframe you provided:
import pandas as pd
# Rectangle corner coordinates (tl/tr/br/bl x,y) plus height/width — the
# six sample rows shown in the question, keyed by row index.
df = pd.DataFrame(
{
"tl_x": {0: 1567, 1: 1360, 2: 1567, 3: 1311, 4: 1565, 5: 1566},
"tl_y": {0: 136, 1: 154, 2: 154, 3: 175, 4: 174, 5: 196},
"tr_x": {0: 1707, 1: 1548, 2: 1747, 3: 1548, 4: 1741, 5: 1753},
"tr_y": {0: 136, 1: 154, 2: 154, 3: 175, 4: 174, 5: 196},
"br_x": {0: 1707, 1: 1548, 2: 1747, 3: 1548, 4: 1741, 5: 1753},
"br_y": {0: 153, 1: 175, 2: 174, 3: 196, 4: 199, 5: 220},
"bl_x": {0: 1567, 1: 1360, 2: 1567, 3: 1311, 4: 1565, 5: 1566},
"bl_y": {0: 153, 1: 175, 2: 174, 3: 196, 4: 199, 5: 220},
"ht": {0: 17, 1: 21, 2: 20, 3: 21, 4: 25, 5: 24},
"wd": {0: 140, 1: 188, 2: 180, 3: 237, 4: 176, 5: 187},
}
)
Here is one way to do it:
# Calculate the gap between consecutive "br_y" values (rows sorted by br_y)
df = df.sort_values(by="br_y")
# Fix: fillna(method=...) is deprecated (removed in pandas 3.0);
# Series.bfill()/ffill() are the drop-in replacements.
df["previous"] = df["br_y"].shift(1).bfill()
df["distance"] = df["br_y"] - df["previous"]
# Group values if distance > 5% of "br_y" values mean (arbitrarily chosen)
clusters = df.copy().loc[df["distance"] > 0.05 * df["br_y"].mean()]
clusters["clustered_br_y"] = [f"row{i}" for i in range(clusters.shape[0])]
# Add cluster labels back to the dataframe: rows that start a cluster get
# a label from the merge; ffill/bfill propagate it to the rest of the rows.
df = (
    pd.merge(
        how="left",
        left=df,
        right=clusters["clustered_br_y"],
        left_index=True,
        right_index=True,
    )
    .ffill()
    .bfill()
    .drop(columns=["previous", "distance"])
    .reset_index(drop=True)
)
tl_x tl_y tr_x tr_y br_x br_y bl_x bl_y ht wd clustered_br_y
0 1567 136 1707 136 1707 153 1567 153 17 140 row0
1 1567 154 1747 154 1747 174 1567 174 20 180 row0
2 1360 154 1548 154 1548 175 1360 175 21 188 row0
3 1311 175 1548 175 1548 196 1311 196 21 237 row1
4 1565 174 1741 174 1741 199 1565 199 25 176 row1
5 1566 196 1753 196 1753 220 1566 220 24 187 row2

How to make dataframe from list of list

I have searched but am still not able to figure out how to make a data frame from the below:
0 ([179, 142, 176, 177, 176, 180, 180, 180, 180,...
1 ([353, 314, 349, 349, 344, 359, 359, 359, 359,...
2 ([535, 504, 535, 535, 535, 540, 540, 540, 540,...
3 ([711, 664, 703, 703, 703, 721, 721, 721, 721,...
4 ([850, 810, 822, 822, 842, 857, 857, 857, 857,.
below is how single data looks
([179, 142, 176],
['Qtr- Oct-20','Qtr- Oct-20','Qtr- Oct-20',],
['High','Low','Close'],
[43.8, 26.05,33.1])
what i want is
0 1 2 3
0 179 Qtr- Oct-20 High 43.8
1 142 Qtr- Oct-20 Low 26.05
2 176 Qtr- Oct-20 High_Volume 1123132
3 177 Qtr- Oct-20 High_Delivery 42499
what i am getting
0
0 ([179, 142, 176, 177, 176, 180, 180, 180, 180,...
1 ([353, 314, 349, 349, 344, 359, 359, 359, 359,...
2 ([535, 504, 535, 535, 535, 540, 540, 540, 540,...
Let's do apply + pd.Series.explode:
pd.DataFrame(df['col'].tolist()).apply(pd.Series.explode).reset_index(drop=True)
0 1 2 3
0 179 Qtr- Oct-20 High 43.8
1 142 Qtr- Oct-20 Low 26.05
2 176 Qtr- Oct-20 Close 33.1
Note: df['col'] is the column in the dataframe which contains list of lists.

Pyparsing two-dimensional list

I have the following sample data:
165 150 238 402 395 571 365 446 284 278 322 282 236
16 5 19 10 12 5 18 22 6 4 5
259 224 249 193 170 151 95 86 101 58 49
6013 7413 8976 10392 12678 9618 9054 8842 9387 11088 11393;
It is the equivalent of a two dimensional array (except each row does not have an equal amount of columns). At the end of each line is a space and then a \n except for the final entry which is followed by no space and only a ;.
Would anyone know the pyparsing grammer to parse this? I've been trying something along the following lines but it will not match.
data = Group(OneOrMore(Group(OneOrMore(Word(nums) + SPACE)) + LINE) + \
Group(OneOrMore(Word(nums) + SPACE)) + Word(nums) + Literal(";")
The desired output would ideally be as follows
[['165', '150', '238', '402', '395', '571', '365', '446', '284', '278',
'322', '282', '236'], ['16', '5', ... ], [...], ['6013', ..., '11393']]
Any assistance would be greatly appreciated.
You can use the stopOn argument to OneOrMore to make it stop matching. Then, since newlines are by default skippable whitespace, the next group can start matching, and it will just skip over the newline and start at the next integer.
import pyparsing as pp
# One Group per physical line: keep matching integers until the end of
# the line (stopOn), so each row becomes its own sublist in the result.
data_line = pp.Group(pp.OneOrMore(pp.pyparsing_common.integer(), stopOn=pp.LineEnd()))
# The full dataset: one or more rows, then a suppressed terminating ";".
# Newlines are skippable whitespace, so the next Group resumes after them.
data_lines = pp.OneOrMore(data_line) + pp.Suppress(';')
Applying this to your sample data:
data = """\
165 150 238 402 395 571 365 446 284 278 322 282 236
16 5 19 10 12 5 18 22 6 4 5
259 224 249 193 170 151 95 86 101 58 49
6013 7413 8976 10392 12678 9618 9054 8842 9387 11088 11393;"""
parsed = data_lines.parseString(data)
from pprint import pprint
pprint(parsed.asList())
Prints:
[[165, 150, 238, 402, 395, 571, 365, 446, 284, 278, 322, 282, 236],
[16, 5, 19, 10, 12, 5, 18, 22, 6, 4, 5],
[259, 224, 249, 193, 170, 151, 95, 86, 101, 58, 49],
[6013, 7413, 8976, 10392, 12678, 9618, 9054, 8842, 9387, 11088, 11393]]

Categories