Related
I created an n-gram count in Python, and the output is something like this:
'vida': 113, 'sistema': 104, 'economía': 91, 'nacional': 84, 'mujeres': 76, 'derechos': 75, 'salud': 75, 'paz': 67, 'colombia': 66, 'social': 66, 'trabajo': 63, 'protección': 62, 'política': 61, 'país': 55, 'acceso': 53, 'cambio': 49, 'sociedad': 49, 'derecho': 49, 'educación': 47, 'cuidado': 46, 'así': 45, 'productiva': 44, 'condiciones': 44, 'cultura': 41, 'participación': 39, 'agua': 39, 'gobierno': 38, 'desarrollo': 38, 'integral': 37, 'construcción': 37, 'personas': 37, 'naturaleza': 35, 'territorios': 35, 'popular': 35, 'público': 35, 'pública': 34, 'territorio': 33, 'pueblos': 33, 'servicios': 33, 'todas': 33, 'población': 33, 'indígenas': 32, 'garantizaremos': 32, 'climático': 31, 'vivienda': 31, 'justicia': 31, 'políticas': 31, 'través': 31, 'producción': 30, 'cultural': 30, 'empleo': 30, 'calidad': 29, 'internacional': 29, 'manera': 29, 'pacto': 28, 'públicos': 28, 'mediante': 28, 'impulsaremos': 28, 'plan': 27, 'procesos': 27, 'territorial': 26, 'seguridad': 26, 'reconocimiento': 26, 'igualdad': 25, 'conocimiento': 25, 'afrodescendientes': 25, 'economías': 25, 'toda': 25, 'sociales': 25, 'nivel': 25, 'garantizar': 25, 'comunidades': 25, 'atención': 25, 'uso': 25, 'territoriales': 25, 'tierra': 24, 'víctimas': 24, 'transición': 24, 'trabajadores': 24, 'mayor': 24, 'programas': 24, 'enfoque': 24, 'base': 24, 'democratización': 23, 'productividad': 23, 'arte': 23, 'negros': 23, 'avanzaremos': 23, 'rurales': 23, 'públicas': 23, 'formas': 23, 'sector': 23, 'formación': 23, 'garantía': 22, 'lucha': 22, 'palenqueros': 22, 'pensión': 22, 'fin': 22, 'transformación': 22, 'marco': 22, 'culturales': 22, 'género': 21, 'sistemas': 21, 'ambiental': 21, 'modelo': 21, 'mundial': 20, 'niños': 20, 'gran': 20, 'alimentaria': 20, 'zonas': 20, 'bajo': 20, 'reforma': 20, 'potencia': 19, 'crédito': 19, 'primera': 19, 'raizales': 19, 'mayores': 19, 'fundamental': 19, 'organizaciones': 19, 'educativo': 19, 'comunitarias': 19, 'ambientales': 19, 'espacio': 18, 'industria': 18, 
'jóvenes': 18, 'infancia': 18, 'niñas': 18, 'patrimonio': 18, 'alimentación': 18, 'económica': 18, 'instrumentos': 18, 'bienes': 18, 'generación': 18, 'recursos': 18, 'proyectos': 18, 'promoveremos': 18, 'laboral': 18, 'deporte': 17, 'realidad': 17, 'diversidad': 17, 'humana': 17, 'reparación': 17, 'programa': 17, 'soberanía': 17, 'garantizando': 17, 'diálogo': 17, 'mercado': 17, 'regional': 17, 'económico': 16, 'saber': 16, 'campo': 16, 'campesinado': 16, 'tiempo': 16, 'parte': 16, 'familias': 16, 'energía': 16, 'autonomía': 16, 'local': 16, 'rural': 16, 'populares': 16, 'apoyo': 16, 'fortalecimiento': 16, 'libre': 15, 'oportunidades': 15, 'haremos': 15, 'rrom': 15, 'adultos': 15, 'basada': 15, 'capacidad': 15, 'millones': 15, 'libertad': 15, 'acuerdo': 15, 'superar': 15, 'valor': 15, 'energética': 15, 'fortaleceremos': 15, 'centros': 15, 'gestión': 15, 'ser': 15, 'garantías': 14, 'agenda': 14, 'corrupción': 14, 'nuevo': 14, 'crearemos': 14, 'hombres': 14, 'superior': 14, 'general': 14, 'tierras': 14, 'espacios': 14, 'capacidades': 14, 'ingresos': 14, 'nacionales': 14, 'mejorar': 14, 'prácticas': 14, 'servicio': 14, 'control': 14, 'alrededor': 13, 'desigualdad': 13, 'convivencia': 13, 'nación': 13, 'diversidades': 13, 'saberes': 13, 'puedan': 13, 'grandes': 13, 'institucionalidad': 13, 'áreas': 13, 'riesgo': 13, 'infraestructura': 13, 'financiamiento': 13, 'transporte': 13, 'cada': 13, 'particular': 13, 'red': 12, 'vivir': 12, 'defensa': 12, 'fundamentales': 12, 'democracia': 12, 'efectiva': 12, 'permita': 12, 'colombiana': 12, 'productores': 12, 'universal': 12, 'creación': 12, 'mínimo': 12, 'permitan': 12, 'especial': 12, 'articulación': 12, 'distribución': 12, 'regionales': 12, 'hacer': 12, 'actividades': 12, 'locales': 12, 'productos': 12, 'intercultural': 12, 'tecnologías': 12, 'entidades': 12, 'sectores': 12, 'memoria': 12, 'negocio': 11, 'física': 11, 'poblaciones': 11, 'violencia': 11, 'trabajos': 11, 'impuestos': 11, 'promoción': 11, 'frente': 11, 
'cuidados': 11, 'bienestar': 11, 'digno': 11, 'reducción': 11, 'dentro': 11, 'investigación': 11, 'debe': 11, 'medidas': 11, 'implica': 11, 'comunitarios': 11, 'cobertura': 11, 'incluyendo': 11, 'nuevas': 11, 'autoridades': 11, 'aprovechamiento': 11
I want to create a CSV file from it that contains two columns, "words" and "Count". I'm looking for a way to do it in Python or R.
Thanks for the help!
In tidyverse, we can split the string into rows at each `,` with separate_rows, and then split the column into two at the `:` with separate
library(dplyr)
library(tidyr)
library(stringr)
# Put the raw string in a one-column tibble and explode it into one
# "'word': n" entry per row.
entries <- separate_rows(tibble(col1 = str1), col1, sep = ",\\s*")

# Split every entry at the colon; convert = TRUE turns the counts into numbers.
pairs <- separate(
  entries, col1,
  into = c("key", "value"), sep = ":\\s*", convert = TRUE
)

# Drop the single quotes that surround each word.
mutate(pairs, key = str_remove_all(key, "'"))
-output
# A tibble: 261 × 2
key value
<chr> <dbl>
1 vida 113
2 sistema 104
3 economía 91
4 nacional 84
5 mujeres 76
6 derechos 75
7 salud 75
8 paz 67
9 colombia 66
10 social 66
# … with 251 more rows
You can try
# Parse a Python-dict-style dump such as "'vida': 113, 'sistema': 104" into a
# data frame with one row per entry.
#
# x: a single character string of comma-separated "'word': count" entries.
#    Assumes the words themselves contain neither commas nor colons.
# Returns a data.frame with columns `words` (character) and `Count` (integer).
ngram_to_df <- function(x) {
  # One entry per comma; trimws() also removes line breaks inside the dump.
  entries <- trimws(strsplit(x, ",", fixed = TRUE)[[1]])
  # A trailing comma leaves an empty entry behind; drop it.
  entries <- entries[nzchar(entries)]

  # Everything before the last colon is the key, everything after is the count.
  keys <- trimws(sub(":[^:]*$", "", entries))
  counts <- trimws(sub("^.*:", "", entries))

  data.frame(
    # Strip only the surrounding quotes so apostrophes inside a word survive.
    words = sub("^'(.*)'$", "\\1", keys),
    Count = as.integer(counts)
  )
}

x <- "the string in your question"
df <- ngram_to_df(x)
# Use the .csv extension so the file is recognised as CSV.
write.csv(df, "myngram.csv", row.names = FALSE)
Here is one more tidyverse approach:
As in @akrun's answer, we first separate the rows at each `, `; then we use str_extract from the stringr package twice, once with a regex for the word and once with a regex for the count:
library(tidyverse)
# Start from a one-column tibble named `value` (tibble() replaces as_tibble(),
# which is superseded for atomic vectors), then put one entry on each row.
tibble(value = str1) %>%
  separate_rows(value, sep = ", ") %>%
  mutate(
    # \p{L} matches any Unicode letter, so accented words such as "economía"
    # are kept whole; [A-Za-z] would cut them off at the first accent.
    key = str_extract(value, "\\p{L}+"),
    # The first run of digits in the entry is the count.
    value = as.numeric(str_extract(value, "[0-9]+")),
    .before = 1
  )
key value
<chr> <dbl>
1 vida 113
2 sistema 104
3 econom 91
4 nacional 84
5 mujeres 76
6 derechos 75
7 salud 75
8 paz 67
9 colombia 66
10 social 66
# ... with 251 more rows
import matplotlib.pyplot as plt
import numpy as np
# 400 draws from a normal distribution (mean 9, std 6), truncated to
# integers and then shifted up by 15.
samples = np.random.normal(loc=9, scale=6, size=400)
randomnums = samples.astype(int) + 15
Output:
array([25, 22, 19, 26, 24, 9, 19, 32, 30, 25, 29, 17, 21, 14, 17, 27, 27,
28, 17, 17, 20, 21, 16, 28, 20, 24, 15, 20, 20, 13, 33, 21, 30, 27,
8, 22, 24, 25, 23, 13, 24, 20, 16, 32, 15, 26, 34, 16, 21, 21, 28,
22, 23, 18, 20, 22, 23, 22, 23, 26, 22, 25, 19, 29, 14, 27, 21, 23,
24, 19, 25, 15, 22, 23, 19, 19, 23, 21, 22, 17, 25, 15, 24, 25, 23 ...
# Copy the samples into a list and sort it in place, then draw the
# histogram with raw counts (density=False) and display it.
h = list(randomnums)
h.sort()
plt.hist(h, density=False)
plt.show()
Output:
From my research I found only how to plot numbers on top of a bar chart, but what I want is to plot on top of a histogram chart. Is it possible?
An adapted version of the answer I linked in the comments of the question. Thanks a lot for the suggestions in the comments below this post!
import matplotlib.pyplot as plt
import numpy as np
# Sample data: 400 normal draws (mean 9, std 6), truncated to int, shifted by 15.
h = np.random.normal(loc=9, scale=6, size=400).astype(int) + 15

fig, ax = plt.subplots(figsize=(16, 10))
ax.hist(h, density=False)

# Every histogram bar is a rectangle patch on the axes; write its height
# (the bin count) five points above the middle of its top edge.
for bar in ax.patches:
    count = bar.get_height()
    x_mid = bar.get_x() + bar.get_width() / 2
    ax.annotate(
        f'{int(count)}',
        xy=(x_mid, count),
        xytext=(0, 5),
        textcoords='offset points',
        ha='center',
        va='bottom',
    )
...gives e.g.
See also: matplotlib.axes.Axes.annotate.
I have following file data.txt
This file contains a number of bounding boxes and their respective heights. I have written a function that extracts the heights of all the boxes from the JSON input in data.txt:
heights [43, 17, 23, 24, 17, 27, 19, 19, 24, 22, 8, 8, 26, 25, 18, 19,
20, 20, 20, 21, 20, 20, 22, 18, 18, 19, 19, 16, 13, 20, 20, 19, 19,
20, 13, 20, 18, 18, 13, 12, 19, 25, 17, 13, 38, 38, 20, 19, 16]
I have written the following script to plot the height of each box:
# Parallel lists for plotting: box index (0-based) and the box's height.
box_number = []
box_height = []
for index2, num2 in enumerate(heights):
    print('box number', index2, 'box height', num2)
    box_number.append(index2)
    box_height.append(num2)

# ax = sns.lineplot(x=box_number, y=box_height)
# Pass x and y by keyword: seaborn >= 0.12 no longer accepts them positionally.
ax = sns.stripplot(x=box_number, y=box_height)
ax.set(xlabel='box number', ylabel='height of box')

# giving title to the plot
plt.title('My first graph')

# function to show plot
plt.show()
here's the output:
I want to write a function that prints the boxes that are unusually tall, i.e. whose height deviates strongly from the mean height. In short, it should print box numbers 0, 44 and 45. How can I do this?
(Every time I will get a different set of boxes but I'll have to find a mean value of their height and print boxes which are too tall)
There are several strategies to discover outliers. The definition of outlier is what matters at the end of the day. If you want a simple computation as you described, you can do something like this:
import numpy as np
# Box heights; the position in the list is the box number.
hs = [43, 17, 23, 24, 17, 27, 19, 19, 24, 22, 8, 8, 26, 25, 18, 19, 20, 20, 20, 21, 20,
      20, 22, 18, 18, 19, 19, 16, 13, 20, 20, 19, 19, 20, 13, 20, 18, 18, 13, 12, 19,
      25, 17, 13, 38, 38, 20, 19, 16]

# An outlier here is a height more than two standard deviations from the mean.
center = np.mean(hs)
threshold = 2 * np.std(hs)
outliers_definition = np.abs(hs - center) > threshold

# argwhere returns the positions of the True entries, one row per outlier.
outliers_idx = np.argwhere(outliers_definition)
print(outliers_idx)
# array([[ 0],
#        [44],
#        [45]], dtype=int64)
Notice that the mean here is also taking the outliers into account. You could use the median, for example. If you want something more robust, there is a vast literature on outlier detection. I recommend you to take a look at it.
I can initialize a numpy array and reshape it at the time of creation.
# The integers 0..31 laid out row by row in a 4 x 8 array.
test = np.reshape(np.arange(32), (4, 8))
which produces this:
array([[ 0, 1, 2, 3, 4, 5, 6, 7],
[ 8, 9, 10, 11, 12, 13, 14, 15],
[16, 17, 18, 19, 20, 21, 22, 23],
[24, 25, 26, 27, 28, 29, 30, 31]])
... but I'd like to know how to start the sequential numbering at a given point, say at 13 rather than at 0. How is that done in numpy?
I've looked for answers and found something somewhat similar but it seems there would be a numpy command to do this.
arange takes an optional start argument.
# First value of the sequence; any number works here.
start = 13
# 32 consecutive values beginning at `start`, laid out as 4 rows of 8.
np.reshape(np.arange(start, start + 32), (4, 8))
# array([[13, 14, 15, 16, 17, 18, 19, 20],
#        [21, 22, 23, 24, 25, 26, 27, 28],
#        [29, 30, 31, 32, 33, 34, 35, 36],
#        [37, 38, 39, 40, 41, 42, 43, 44]])
Given these test cases:
votes = [6]*28
m = 10
votes1 = [5]*28+[6]*2
m1 = 10
votes2 = [5]*29+[10]*1
m2 = 10
votes3 = [8, 8, 16, 12, 12, 12, 4, 4, 12, 4, 4, 4, 8, 12, 12, 8, 8, 16, 12, 4, 16, 16, 12, 16, 12, 16, 12, 4, 16, 4, 4, 12, 4, 12, 12, 4, 16, 12, 16, 8]
m3 = 20
votes4 = [22, 21, 34, 39, 28, 33, 32, 40, 22, 34, 36, 27, 37, 34, 40, 38, 39, 32, 37, 40, 31, 37, 22, 21, 35, 34, 24, 40, 34, 21, 24, 20, 30, 31, 22, 30, 31, 25, 20, 38, 24, 23, 32, 27, 20, 31, 27, 32, 22, 32, 33, 34, 40, 38, 36, 29, 34, 24, 24, 39, 32, 37, 30, 20, 29, 26, 36, 40, 34, 22, 30, 27, 38, 27, 26, 28, 23, 40, 31, 22, 23, 35, 23, 31, 23, 39, 30, 20, 20, 35, 27, 23, 23, 29, 40, 20, 34, 40, 28, 25]
m4 = 50
votes5 = [25, 25, 25, 24, 25, 24, 24, 25, 26, 25, 26, 24, 25, 26, 24, 26, 24, 26, 26, 25, 26, 24, 26, 24, 26, 26, 26, 25, 25, 26, 24, 26, 25, 25, 24, 25, 25, 26, 26, 26, 25, 26, 25, 26, 25, 25, 24, 24, 24, 25, 24, 26, 25, 24, 26, 24, 24, 26, 24, 26, 24, 24, 24, 26, 24, 25, 24, 26, 25, 25, 26, 25, 25, 25, 25, 26, 25, 24, 25, 25, 24, 24, 24, 26, 26, 26, 25, 24, 25, 25, 25, 26, 25, 24, 26, 24, 25, 26, 24, 26]
m5 = 50
Given the following bounds:
def upperbound(v, m, n):
    """Return ``v - 0.25 * m * n - 1`` rounded down to an integer."""
    limit = v - 0.25 * m * n - 1
    return math.floor(limit)
def lowerbound(m, n):
    """Return ``0.25 * m * n + 1`` rounded up to an integer."""
    limit = 0.25 * m * n + 1
    return math.ceil(limit)
I would like to find if there is a sum of subset with length exactly len(votes)/2 that would satisfy the given upperbound and lowerbound.
Below is my attempt at solving the problem using the knapsack but it doesn't take into account the length of the subset.
import math
def winnable(votes, m):
    """Report whether some subset of ``votes`` sums to a value within the bounds.

    The bounds come from ``lowerbound`` and ``upperbound``; ``knapSack`` gives
    the largest subset sum that stays at or below the upper bound. The size of
    the subset is not constrained.

    Returns the string "possible" or "not possible".
    """
    n = len(votes)  # Number of columns
    total = sum(votes)
    lb = lowerbound(m, n)
    ub = upperbound(total, m, n)
    best = knapSack(ub, votes, n)
    return "possible" if best >= lb else "not possible"
def knapSack(ub, val, n):
    """Return the largest subset sum of ``val[:n]`` that does not exceed ``ub``.

    Classic 0/1 knapsack in which each value is also its own weight.
    ``table[i][cap]`` holds the best sum achievable with the first ``i``
    values under capacity ``cap``.
    """
    # Row 0 and column 0 stay at zero: no items, or no capacity.
    table = [[0] * (ub + 1) for _ in range(n + 1)]
    for i in range(1, n + 1):
        above = table[i - 1]
        row = table[i]
        for cap in range(1, ub + 1):
            item = val[i - 1]
            skip = above[cap]
            if item <= cap:
                # Either take the item on top of the best smaller fill, or skip it.
                row[cap] = max(item + above[cap - item], skip)
            else:
                row[cap] = skip
    return table[n][ub]
Is it possible to further modify my solution so that it takes the number of elements in the subset into account?
I have implemented @mrmcgreg's proposal of adding an additional dimension to the problem:
import math
def has_valid_subset(votes, m):
    """Check for a subset of exactly ``len(votes) // 2`` votes with a sum in range.

    The admissible range is inclusive on both ends:
    ``ceil(0.25 * m * n + 1) <= subset sum <= floor(sum(votes) - 0.25 * m * n - 1)``
    with ``n = len(votes)``.

    Args:
        votes: sequence of non-negative integers.
        m: multiplier used in the bound formulas.

    Returns:
        True if such a subset exists, otherwise False (also for empty input).
    """
    n = len(votes)
    sum_min = math.ceil(0.25 * m * n + 1)
    sum_max = math.floor(sum(votes) - 0.25 * m * n - 1)
    n_half = n // 2

    # Subset sums are never negative, so clamp the lower bound at zero.
    lowest = max(sum_min, 0)
    if sum_max < lowest:
        return False

    # reachable[k] is a bitmask: bit s is set when some k-element subset of
    # the votes seen so far sums to s. Sums above sum_max are masked away.
    keep = (1 << (sum_max + 1)) - 1
    reachable = [0] * (n_half + 1)
    reachable[0] = 1  # the empty subset sums to 0

    for seen, v in enumerate(votes, start=1):
        # Go from large k to small so each vote is used at most once.
        for k in range(min(seen, n_half), 0, -1):
            reachable[k] |= (reachable[k - 1] << v) & keep

    # Bits lowest..sum_max mark the admissible sums.
    window = (keep >> lowest) << lowest
    return (reachable[n_half] & window) != 0
Tests:
votes = [6]*28
m = 10
print(has_valid_subset(votes, m))
# True
votes1 = [5]*28+[6]*2
m1 = 10
print(has_valid_subset(votes1, m1))
# True
votes2 = [5]*29+[10]*1
m2 = 10
print(has_valid_subset(votes2, m2))
# False
votes3 = [8, 8, 16, 12, 12, 12, 4, 4, 12, 4, 4, 4, 8, 12, 12, 8, 8, 16, 12, 4, 16, 16, 12, 16, 12, 16, 12, 4, 16, 4, 4, 12, 4, 12, 12, 4, 16, 12, 16, 8]
m3 = 20
print(has_valid_subset(votes3, m3))
# False
votes4 = [22, 21, 34, 39, 28, 33, 32, 40, 22, 34, 36, 27, 37, 34, 40, 38, 39, 32, 37, 40, 31, 37, 22, 21, 35, 34, 24, 40, 34, 21, 24, 20, 30, 31, 22, 30, 31, 25, 20, 38, 24, 23, 32, 27, 20, 31, 27, 32, 22, 32, 33, 34, 40, 38, 36, 29, 34, 24, 24, 39, 32, 37, 30, 20, 29, 26, 36, 40, 34, 22, 30, 27, 38, 27, 26, 28, 23, 40, 31, 22, 23, 35, 23, 31, 23, 39, 30, 20, 20, 35, 27, 23, 23, 29, 40, 20, 34, 40, 28, 25]
m4 = 50
print(has_valid_subset(votes4, m4))
# True
votes5 = [25, 25, 25, 24, 25, 24, 24, 25, 26, 25, 26, 24, 25, 26, 24, 26, 24, 26, 26, 25, 26, 24, 26, 24, 26, 26, 26, 25, 25, 26, 24, 26, 25, 25, 24, 25, 25, 26, 26, 26, 25, 26, 25, 26, 25, 25, 24, 24, 24, 25, 24, 26, 25, 24, 26, 24, 24, 26, 24, 26, 24, 24, 24, 26, 24, 25, 24, 26, 25, 25, 26, 25, 25, 25, 25, 26, 25, 24, 25, 25, 24, 24, 24, 26, 26, 26, 25, 24, 25, 25, 25, 26, 25, 24, 26, 24, 25, 26, 24, 26]
m5 = 50
print(has_valid_subset(votes5, m5))
# True
Here is the heart of a reasonably efficient solution. Lightly tested but probably correct. (Biggest question, did you want the solution to include the bounds, or exclude them?)
def subset_sum_of_len_in_range(vals, total_to_use, lower, upper):
    """Check whether some subset of exactly ``total_to_use`` values sums into a range.

    Args:
        vals: sequence of numbers to choose from.
        total_to_use: exact number of values the subset must contain; an
            integral float such as ``28 / 2`` is accepted.
        lower: smallest admissible sum (inclusive).
        upper: largest admissible sum (inclusive).

    Returns:
        True if such a subset exists, otherwise False.

    Raises:
        ValueError: if ``total_to_use`` is not a whole number.
    """
    count = int(total_to_use)
    if count != total_to_use:
        raise ValueError("total_to_use must be a whole number")

    # Sorting makes the min/max of any partial sum easy to compute: the
    # smallest k remaining values are the next k, the largest k are the last k.
    sorted_vals = sorted(vals)
    n_vals = len(sorted_vals)
    if count < 0 or count > n_vals:
        return False

    # cum_prev_sum[i] is the sum of the first i sorted values, so the sum of
    # any contiguous range is the difference of two entries.
    cum_prev_sum = [0]
    for value in sorted_vals:
        cum_prev_sum.append(cum_prev_sum[-1] + value)

    # Memo for the recursive solver, keyed by (position, to_use, current_sum).
    cache = {}

    def sub_problem(position, to_use, current_sum):
        if n_vals - position < to_use:
            # Not enough values left to possibly solve this.
            return False

        cache_key = (position, to_use, current_sum)
        if cache_key in cache:
            return cache[cache_key]

        # Smallest reachable total: take the next `to_use` sorted values.
        lowest_sum = (current_sum + cum_prev_sum[position + to_use]
                      - cum_prev_sum[position])
        if upper < lowest_sum:
            # Can't possibly get in range.
            result = False
        elif lower <= lowest_sum:
            # Found one in range!
            result = True
        else:
            # Largest reachable total: take the last `to_use` sorted values.
            highest_sum = (current_sum + cum_prev_sum[n_vals]
                           - cum_prev_sum[n_vals - to_use])
            if highest_sum < lower:
                # Can't possibly get in range.
                result = False
            elif highest_sum <= upper:
                # Found one in range!
                result = True
            else:
                # Branch on the value at `position`: skip it or use it. The
                # value must come from sorted_vals, since the bounds above
                # describe the sorted order.
                result = (
                    sub_problem(position + 1, to_use, current_sum)
                    or sub_problem(position + 1, to_use - 1,
                                   current_sum + sorted_vals[position])
                )

        cache[cache_key] = result
        return result

    return sub_problem(0, count, 0)
To use it for your problem...
import math
def upperbound(v, m, n):
    """Return ``v - 0.25 * m * n - 1`` rounded down to an integer."""
    limit = v - 0.25 * m * n - 1
    return math.floor(limit)
def lowerbound(m, n):
    """Return ``0.25 * m * n + 1`` rounded up to an integer."""
    limit = 0.25 * m * n + 1
    return math.ceil(limit)
def winnable(votes, m):
    """Report whether exactly half of ``votes`` can sum to a value within the bounds.

    The bounds come from ``lowerbound`` and ``upperbound``; the search itself
    is done by ``subset_sum_of_len_in_range``.

    Returns the string "possible" or "not possible".
    """
    n = len(votes)  # Number of columns
    v = sum(votes)
    ub = upperbound(v, m, n)
    lb = lowerbound(m, n)
    # Integer division: the subset size is used as a list index downstream,
    # and n / 2 would be a float in Python 3.
    if subset_sum_of_len_in_range(votes, n // 2, lb, ub):
        return "possible"
    else:
        return "not possible"