Cython: for i from 1 <= i < N - python

I'm learning Cython and came across this snippit of code:
import numpy as np
cimport numpy as np
def mean(np.ndarray[np.double_t] input):
cdef np.double_t cur
# Py_ssize_t is numpy's index type
cdef Py_ssize_t i
cdef Py_ssize_t N = len(input)
for i from 0 <= i < N:
cur += input[i]
return cur / N
a=np.array([1,2,3,4], dtype=np.double)
Obviously, this returns the mean of a which is 2.5. My question is this:
Is the for loop a Python loop, Cython, or C?

Compile it and see: the C code that Cython produces is nicely annotated.
/* "cyexample.pyx":11
* cdef Py_ssize_t N = len(input)
*
* for i from 0 <= i < N: # <<<<<<<<<<<<<<
* cur += input[i]
*
*/
__pyx_t_1 = __pyx_v_N;
for (__pyx_v_i = 0; __pyx_v_i < __pyx_t_1; __pyx_v_i++) {
/* "cyexample.pyx":12
*
* for i from 0 <= i < N:
* cur += input[i] # <<<<<<<<<<<<<<
*
* return cur / N
*/
__pyx_t_2 = __pyx_v_i;
__pyx_t_3 = -1;
if (__pyx_t_2 < 0) {
__pyx_t_2 += __pyx_bshape_0_input;
if (unlikely(__pyx_t_2 < 0)) __pyx_t_3 = 0;
} else if (unlikely(__pyx_t_2 >= __pyx_bshape_0_input)) __pyx_t_3 = 0;
if (unlikely(__pyx_t_3 != -1)) {
__Pyx_RaiseBufferIndexError(__pyx_t_3);
{__pyx_filename = __pyx_f[0]; __pyx_lineno = 12; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
}
__pyx_v_cur = (__pyx_v_cur + (*__Pyx_BufPtrStrided1d(__pyx_t_5numpy_double_t *, __pyx_bstruct_input.buf, __pyx_t_2, __pyx_bstride_0_input)));
}
And so the loop itself is successfully turned into C. Note that these days Cython can handle range naturally, so the older "from 0 <= i < N" style isn't necessary. The point of introducing the (non-Python) "for/from" syntax was to signify which loops should be C-ified.

for..from seems to be a Pyrex / Cython loop: http://docs.cython.org/src/userguide/language_basics.html#integer-for-loops

Related

Primality test Python C extension is slower than pure Python

I've implemented a 6k+-1 primality test function both in a C extension and pure Python code but seems pure Python code is much faster! is there something wrong with my C code or something else?
I also compiled a similar test in pure C with the is_prime function, and its execution time was the same as the C extension (almost 2sec)
primemodule.c
#define PY_SSIZE_T_CLEAN
#include "Python.h"
int is_prime(int n)
{
int i;
if (n <= 3)
return (n > 1);
if (n % 2 == 0 || n % 3 == 0)
return (0);
i = 5;
while ((i * i) <= n)
{
if (n % i == 0 || n % (i + 2) == 0)
return (0);
i += 6;
}
return (1);
}
static PyObject *prime_isprime(PyObject *self, PyObject *args)
{
int n;
if (!PyArg_ParseTuple(args, "i", &n))
return (NULL);
if (is_prime(n))
Py_RETURN_TRUE;
Py_RETURN_FALSE;
}
static PyMethodDef prime_methods[] = {
{"is_prime", prime_isprime, METH_VARARGS, "Check if a number is prime"},
{NULL, NULL, 0, NULL}};
static struct PyModuleDef prime_module = {
PyModuleDef_HEAD_INIT,
"prime",
NULL,
-1,
prime_methods};
PyMODINIT_FUNC PyInit_prime(void)
{
return (PyModule_Create(&prime_module));
}
py_test.py
import time
MAX_INT = 2147483647
def is_prime(n: int) -> bool:
if n <= 3:
return n > 1
if n % 2 == 0 or n % 3 == 0:
return False
i = 5
while i ** 2 <= n:
if n % i == 0 or n % (i + 2) == 0:
return False
i += 6
return True
t1 = time.process_time()
for i in range(MAX_INT - 100, MAX_INT):
is_prime(i)
print(time.process_time() - t1, "seconds")
c_test.py
import time
import prime
MAX_INT = 2147483647
t1 = time.process_time()
for i in range(MAX_INT - 100, MAX_INT):
prime.is_prime(i)
print(time.process_time() - t1, "seconds")
python c_test.py
2.078125 seconds
python py_test.py
0.03125 seconds
timecmd.bat a.exe
2.13 seconds
I think your C implementation is buggy regarding integer overflows and signedness and ends up in a bigger loop than the Python version.
Changing the parameter type to unsigned int (and i too, since otherwise that's a compiler warning):
static int is_prime(unsigned int n)
{
unsigned int i;
if (n <= 3)
return (n > 1);
if (n == 2 || n == 3)
return (1);
if (n % 2 == 0 || n % 3 == 0)
return (0);
i = 5;
while ((i * i) <= n)
{
if (n % i == 0 || n % (i + 2) == 0)
return (0);
i += 6;
}
return (1);
}
makes it (anecdotally, on my machine, approximately) 37 times faster than the Python implementation.

Creating a snowflake in Python

I am trying to create a program in Python that creates a snowflake based on the input of a number. Below is my code:
n = int(input())
a = [["."] * n] * n
temp = n/2
start_point = 0
mid_point = int(temp)
end_point = n - 1
for i in range(n):
if i > mid_point + 1:
start_point -= 1
end_point += 1
for j in range(n):
if (j == start_point) or (j == mid_point) or (j == end_point) or (i == mid_point):
a[i][j] = "*"
else:
a[i][j] = "."
if i < mid_point - 1:
start_point += 1
end_point -= 1
for row in a:
print(' '.join([str(elem) for elem in row]))
For example, if the input is '5' the output should look like:
* . * . *
. * * * .
* * * * *
. * * * .
* . * . *
However, my output looks like:
. * * * .
. * * * .
. * * * .
. * * * .
. * * * .
I was sure that my code was correct so I rewrote it in Java as:
public class Snowflake {
public static void createSnowflake(int n) {
String[][] array = new String[n][n];
float temp = (float) (n/2);
System.out.println(temp);
int start_point = 0;
int mid_point = (int) (temp);
System.out.println(mid_point);
int end_point = n - 1;
for(int i = 0; i < n; i++) {
if(i > mid_point+1) {
start_point--;
end_point++;
}
for(int j = 0; j < n; j++) {
if((j == start_point) || (j == mid_point) || (j == end_point) || (i == mid_point)) {
array[i][j] = "*";
}
else {
array[i][j] = ".";
}
}
if(i < mid_point-1) {
start_point++;
end_point--;
}
}
for(int i = 0; i < n; i++) {
for(int j = 0; j < n; j++) {
System.out.print(array[i][j]);
}
System.out.print("\n");
}
}
public static void main(String[] args) {
createSnowflake(5);
}
}
And it worked as expected. To my eyes the underlying logic is exactly the same, and yet the Java code works and the Python code doesn't. Could someone help me find where I've made a mistake in the Python syntax or how my Java code somehow differs from it?
If you change the creation of a to:
a= [["." for j in range(n)] for i in range(n)]
it should fix it.
This has to do with the way python copies lists.
Check the question linked on the comments to your question.
Enjoyed this question, I feel like it could only be here during this time of the year.

How correctly draw boxes in pyprocessing?

I'm trying to write a very basic box drawing program using pyprocessing,
but a condition to check if the mouse is within a box fails, when the logic looks ok:
#!/usr/bin/env python
from pyprocessing import *
S = 20
W = 5
H = 5
data = [[0] * W] * H
def setup():
size(W*(S+5),H*(S+5))
def draw():
background(0)
for y in xrange(H):
for x in xrange(W):
fill(data[x][y] * 255)
rect(x*S,y*S,S,S)
def mouseDragged():
for y in xrange(H):
for x in xrange(W):
xs = x * S
ys = y * S
# this doesn't behave as expected: it should draw a single box if the condition is met, not the whole row
if (mouse.x >= xs) and (mouse.x <= (xs+S)) and (mouse.y >= ys and mouse.y <= (ys+S)):
if key.pressed:
data[x][y] = 0
else:
data[x][y] = 1
run()
I've tried the same approach using the Java version of Processing and it works as expected:
int S = 20;
int W = 5;
int H = 5;
int[][] data = new int[W][H];
void setup(){
size(100,100);
noStroke();
}
void draw(){
background(0);
for (int y = 0 ; y < H; y++){
for (int x = 0 ; x < W; x++){
fill(data[x][y] * 255);
rect(x*S,y*S,S,S);
}
}
}
void mouseDragged(){
for (int y = 0 ; y < H; y++){
for (int x = 0 ; x < W; x++){
int xs = x * S;
int ys = y * S;
if ((mouseX > xs) && (mouseX < (xs+S)) && (mouseY >= ys && mouseY <= (ys+S))){
data[x][y] = 1;
}
}
}
}
Similar behaviour in JS:
var S = 20;
var W = 5;
var H = 5;
var data = new Array(W);
function setup(){
createCanvas(100,100);
noStroke();
for (var i = 0 ; i < H; i++) data[i] = [0,0,0,0,0];
}
function draw(){
background(0);
for (var y = 0 ; y < H; y++){
for (var x = 0 ; x < W; x++){
fill(data[x][y] * 255);
rect(x*S,y*S,S,S);
}
}
}
function mouseDragged(){
for (var y = 0 ; y < H; y++){
for (var x = 0 ; x < W; x++){
var xs = x * S;
var ys = y * S;
if ((mouseX > xs) && (mouseX < (xs+S)) && (mouseY >= ys && mouseY <= (ys+S))){
data[x][y] = 1;
}
}
}
}
<script src="https://cdnjs.cloudflare.com/ajax/libs/p5.js/0.4.23/p5.min.js"></script>
Am I writing the box bounds condition correctly in Python ? If so, is there a bug with pyprocessing ? How can I get past it ?
I'm using pyprocessing.version '0.1.3.22'
Trying to be lazy is the issue:
data = [[0] * W] * H
This doesn't simply create a nested array, it copies the references of the first array ([0]), so when I modify one value in one row, the whole row is modified.
Since I'm super experienced with python, I've initialised the array in a probably a non-pythonic way:
data = []
for y in xrange(H):
data.append([])
for x in xrange(W):
data[y].append(0)
So the full working code is:
#!/usr/bin/env python
from pyprocessing import *
S = 20
W = 5
H = 5
# data = [[0] * W] * H #trouble
data = []
for y in xrange(H):
data.append([])
for x in xrange(W):
data[y].append(0)
def setup():
size(W*(S),H*(S))
def draw():
background(0)
for y in xrange(H):
for x in xrange(W):
fill(data[x][y] * 255)
rect(x*S,y*S,S,S)
def mouseDragged():
for y in xrange(H):
for x in xrange(W):
xs = x * S
ys = y * S
if (mouse.x >= xs) and (mouse.x <= (xs+S)) and (mouse.y >= ys and mouse.y <= (ys+S)):
if key.pressed:
data[x][y] = 0
else:
data[x][y] = 1
run()

Cython float division PyExc_ZeroDivisionError checking

I'm doing some loop-intensive calculations and converted the code into Cython.
I did profiling with cython -a option, and inspected the .html file, and it seems whenever I do the float division, there is somewhat yellow line and it does something like the following:
if (unlikely(__pyx_t_37 == 0)) {
PyErr_Format(PyExc_ZeroDivisionError, "float division");
{__pyx_filename = __pyx_f[0]; __pyx_lineno = 84; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
}
I guess it is for the cases where the divider is 0. I am using a constant for that and there is no probability that the divider is 0, and I was wondering if there is anything I can do to make it faster.
You need to add #cython.cdivision(True) to avoid the exception checking.
import cython
cdef double pydivision():
cdef int i
cdef double k, j
k = 2.0
j = 0.0
for i in range(10):
j += i/k
# Generated code: Python exception checking
# /* "checksum.pyx":9
# * j = 0.0
# * for i in range(10):
# * j += i/k # <<<<<<<<<<<<<<
# * return j
# *
# */
# if (unlikely(__pyx_v_k == 0)) {
# PyErr_Format(PyExc_ZeroDivisionError, "float division");
# {__pyx_filename = __pyx_f[0]; __pyx_lineno = 9; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
# }
# __pyx_v_j = (__pyx_v_j + (__pyx_v_i / __pyx_v_k));
# }
return j
#This decorator works wonders
#cython.cdivision(True)
cdef double cdivision():
cdef int i
cdef double k, j
k = 2.0
j = 0.0
for i in range(10):
j += i/k
# Generated code: no exception checking
# /* "checksum.pyx":20
# * j = 0.0
# * for i in range(10):
# * j += i/k # <<<<<<<<<<<<<<
# * return j
# *
# */
# __pyx_v_j = (__pyx_v_j + (__pyx_v_i / __pyx_v_k));
# }
return j
If the divisor is constant, you can multiply by 1/divisor instead

What algorithm does Python employ in fractions.gcd()?

I'm using the fractions module in Python v3.1 to compute the greatest common divisor. I would like to know what algorithm is used. I'm guessing the Euclidean method, but would like to be sure. The docs (http://docs.python.org/py3k/library/fractions.html?highlight=fractions.gcd#fractions.gcd) don't help. Can anybody clue me in?
According to the 3.1.2 source code online, here's gcd as defined in Python-3.1.2/Lib/fractions.py:
def gcd(a, b):
"""Calculate the Greatest Common Divisor of a and b.
Unless b==0, the result will have the same sign as b (so that when
b is divided by it, the result comes out positive).
"""
while b:
a, b = b, a%b
return a
So yes, it's the Euclidean algorithm, written in pure Python.
From fractions python
"Deprecated since version 3.5: Use math.gcd() instead."
I was looking for the algorithm as well. I hope it helped.
Since Python 3.5, the GCD code has been moved to math.gcd. Since Python 3.9, math.gcd takes an arbitrary number of arguments.
The actual GCD code is now implemented in C (for CPython), making it significantly faster than the original pure Python implementation.
Boilerplate:
static PyObject *
math_gcd(PyObject *module, PyObject * const *args, Py_ssize_t nargs)
{
PyObject *res, *x;
Py_ssize_t i;
if (nargs == 0) {
return PyLong_FromLong(0);
}
res = PyNumber_Index(args[0]);
if (res == NULL) {
return NULL;
}
if (nargs == 1) {
Py_SETREF(res, PyNumber_Absolute(res));
return res;
}
PyObject *one = _PyLong_GetOne(); // borrowed ref
for (i = 1; i < nargs; i++) {
x = _PyNumber_Index(args[i]);
if (x == NULL) {
Py_DECREF(res);
return NULL;
}
if (res == one) {
/* Fast path: just check arguments.
It is okay to use identity comparison here. */
Py_DECREF(x);
continue;
}
Py_SETREF(res, _PyLong_GCD(res, x));
Py_DECREF(x);
if (res == NULL) {
return NULL;
}
}
return res;
}
The actual computation uses Lehmer's GCD algorithm:
PyObject *
_PyLong_GCD(PyObject *aarg, PyObject *barg)
{
PyLongObject *a, *b, *c = NULL, *d = NULL, *r;
stwodigits x, y, q, s, t, c_carry, d_carry;
stwodigits A, B, C, D, T;
int nbits, k;
Py_ssize_t size_a, size_b, alloc_a, alloc_b;
digit *a_digit, *b_digit, *c_digit, *d_digit, *a_end, *b_end;
a = (PyLongObject *)aarg;
b = (PyLongObject *)barg;
size_a = Py_SIZE(a);
size_b = Py_SIZE(b);
if (-2 <= size_a && size_a <= 2 && -2 <= size_b && size_b <= 2) {
Py_INCREF(a);
Py_INCREF(b);
goto simple;
}
/* Initial reduction: make sure that 0 <= b <= a. */
a = (PyLongObject *)long_abs(a);
if (a == NULL)
return NULL;
b = (PyLongObject *)long_abs(b);
if (b == NULL) {
Py_DECREF(a);
return NULL;
}
if (long_compare(a, b) < 0) {
r = a;
a = b;
b = r;
}
/* We now own references to a and b */
alloc_a = Py_SIZE(a);
alloc_b = Py_SIZE(b);
/* reduce until a fits into 2 digits */
while ((size_a = Py_SIZE(a)) > 2) {
nbits = bit_length_digit(a->ob_digit[size_a-1]);
/* extract top 2*PyLong_SHIFT bits of a into x, along with
corresponding bits of b into y */
size_b = Py_SIZE(b);
assert(size_b <= size_a);
if (size_b == 0) {
if (size_a < alloc_a) {
r = (PyLongObject *)_PyLong_Copy(a);
Py_DECREF(a);
}
else
r = a;
Py_DECREF(b);
Py_XDECREF(c);
Py_XDECREF(d);
return (PyObject *)r;
}
x = (((twodigits)a->ob_digit[size_a-1] << (2*PyLong_SHIFT-nbits)) |
((twodigits)a->ob_digit[size_a-2] << (PyLong_SHIFT-nbits)) |
(a->ob_digit[size_a-3] >> nbits));
y = ((size_b >= size_a - 2 ? b->ob_digit[size_a-3] >> nbits : 0) |
(size_b >= size_a - 1 ? (twodigits)b->ob_digit[size_a-2] << (PyLong_SHIFT-nbits) : 0) |
(size_b >= size_a ? (twodigits)b->ob_digit[size_a-1] << (2*PyLong_SHIFT-nbits) : 0));
/* inner loop of Lehmer's algorithm; A, B, C, D never grow
larger than PyLong_MASK during the algorithm. */
A = 1; B = 0; C = 0; D = 1;
for (k=0;; k++) {
if (y-C == 0)
break;
q = (x+(A-1))/(y-C);
s = B+q*D;
t = x-q*y;
if (s > t)
break;
x = y; y = t;
t = A+q*C; A = D; B = C; C = s; D = t;
}
if (k == 0) {
/* no progress; do a Euclidean step */
if (l_mod(a, b, &r) < 0)
goto error;
Py_SETREF(a, b);
b = r;
alloc_a = alloc_b;
alloc_b = Py_SIZE(b);
continue;
}
/*
a, b = A*b-B*a, D*a-C*b if k is odd
a, b = A*a-B*b, D*b-C*a if k is even
*/
if (k&1) {
T = -A; A = -B; B = T;
T = -C; C = -D; D = T;
}
if (c != NULL) {
Py_SET_SIZE(c, size_a);
}
else if (Py_REFCNT(a) == 1) {
c = (PyLongObject*)Py_NewRef(a);
}
else {
alloc_a = size_a;
c = _PyLong_New(size_a);
if (c == NULL)
goto error;
}
if (d != NULL) {
Py_SET_SIZE(d, size_a);
}
else if (Py_REFCNT(b) == 1 && size_a <= alloc_b) {
d = (PyLongObject*)Py_NewRef(b);
Py_SET_SIZE(d, size_a);
}
else {
alloc_b = size_a;
d = _PyLong_New(size_a);
if (d == NULL)
goto error;
}
a_end = a->ob_digit + size_a;
b_end = b->ob_digit + size_b;
/* compute new a and new b in parallel */
a_digit = a->ob_digit;
b_digit = b->ob_digit;
c_digit = c->ob_digit;
d_digit = d->ob_digit;
c_carry = 0;
d_carry = 0;
while (b_digit < b_end) {
c_carry += (A * *a_digit) - (B * *b_digit);
d_carry += (D * *b_digit++) - (C * *a_digit++);
*c_digit++ = (digit)(c_carry & PyLong_MASK);
*d_digit++ = (digit)(d_carry & PyLong_MASK);
c_carry >>= PyLong_SHIFT;
d_carry >>= PyLong_SHIFT;
}
while (a_digit < a_end) {
c_carry += A * *a_digit;
d_carry -= C * *a_digit++;
*c_digit++ = (digit)(c_carry & PyLong_MASK);
*d_digit++ = (digit)(d_carry & PyLong_MASK);
c_carry >>= PyLong_SHIFT;
d_carry >>= PyLong_SHIFT;
}
assert(c_carry == 0);
assert(d_carry == 0);
Py_INCREF(c);
Py_INCREF(d);
Py_DECREF(a);
Py_DECREF(b);
a = long_normalize(c);
b = long_normalize(d);
}
Py_XDECREF(c);
Py_XDECREF(d);
simple:
assert(Py_REFCNT(a) > 0);
assert(Py_REFCNT(b) > 0);
/* Issue #24999: use two shifts instead of ">> 2*PyLong_SHIFT" to avoid
undefined behaviour when LONG_MAX type is smaller than 60 bits */
#if LONG_MAX >> PyLong_SHIFT >> PyLong_SHIFT
/* a fits into a long, so b must too */
x = PyLong_AsLong((PyObject *)a);
y = PyLong_AsLong((PyObject *)b);
#elif LLONG_MAX >> PyLong_SHIFT >> PyLong_SHIFT
x = PyLong_AsLongLong((PyObject *)a);
y = PyLong_AsLongLong((PyObject *)b);
#else
# error "_PyLong_GCD"
#endif
x = Py_ABS(x);
y = Py_ABS(y);
Py_DECREF(a);
Py_DECREF(b);
/* usual Euclidean algorithm for longs */
while (y != 0) {
t = y;
y = x % y;
x = t;
}
#if LONG_MAX >> PyLong_SHIFT >> PyLong_SHIFT
return PyLong_FromLong(x);
#elif LLONG_MAX >> PyLong_SHIFT >> PyLong_SHIFT
return PyLong_FromLongLong(x);
#else
# error "_PyLong_GCD"
#endif
error:
Py_DECREF(a);
Py_DECREF(b);
Py_XDECREF(c);
Py_XDECREF(d);
return NULL;
}

Categories