How to run rust function in python program - python

I'm working with a xorcipher app to benchmark both python and rust function but I don't know how to add the rust function to python code can anyone help I need the save the output in buf I'm including my rust code
use std::convert::TryInto;
/*
Apply a simple XOR cipher using they specified `key` of size
`key_size`, to the `msg` char/byte array of size `msg_len`.
Writes he ciphertext to the externally allocated buffer `buf`.
*/
#[no_mangle]
pub unsafe fn cipher(msg: *const i8, key: *const i8, buf: *mut i8, msg_len: usize, key_len: usize)
{
let mut i: isize = 0;
while i < msg_len.try_into().unwrap() {
let key_len_i8: i8 = key_len.try_into().unwrap();
*buf.offset(i) = *msg.offset(i) ^ (*key.offset(i) % key_len_i8);
i = i + 1;
}
}

Let's say your rust code is named rs_cipher.rs.
If you compile it with this command
rustc --crate-type dylib rs_cipher.rs
you obtain a native dynamic library for your system.
(of course, this can also be done with cargo and the crate-type = ["cdylib"] option)
Here is a python code that loads this library, finds the function and calls it (see the comments for these different stages).
import sys
import os
import platform
import ctypes
import traceback
# choose application specific library name
lib_name='rs_cipher'
# determine system system specific library name
if platform.system().startswith('Windows'):
sys_lib_name='%s.dll'%lib_name
elif platform.system().startswith('Darwin'):
sys_lib_name='lib%s.dylib'%lib_name
else:
sys_lib_name='lib%s.so'%lib_name
# try to load native library from various locations
# (current directory, python file directory, system path...)
ntv_lib=None
for d in [os.path.curdir,
os.path.dirname(sys.modules[__name__].__file__),
'']:
path=os.path.join(d, sys_lib_name)
try:
ntv_lib=ctypes.CDLL(path)
break
except:
# traceback.print_exc()
pass
if ntv_lib is None:
sys.stderr.write('cannot load library %s\n'%lib_name)
sys.exit(1)
# try to find native function
fnct_name='cipher'
ntv_fnct=None
try:
ntv_fnct=ntv_lib[fnct_name]
except:
# traceback.print_exc()
pass
if ntv_fnct is None:
sys.stderr.write('cannot find function %s in library %s\n'%
(fnct_name, lib_name))
sys.exit(1)
# describe native function prototype
ntv_fnct.restype=None # no return value
ntv_fnct.argtypes=[ctypes.c_void_p, # msg
ctypes.c_void_p, # key
ctypes.c_void_p, # buf
ctypes.c_size_t, # msg_len
ctypes.c_size_t] # key_len
# use native function
msg=(ctypes.c_int8*10)()
key=(ctypes.c_int8*4)()
buf=(ctypes.c_int8*len(msg))()
for i in range(len(msg)):
msg[i]=1+10*i
for i in range(len(key)):
key[i]=~(20*(i+2))
sys.stdout.write('~~~~ first initial state ~~~~\n')
sys.stdout.write('msg: %s\n'%[v for v in msg])
sys.stdout.write('key: %s\n'%[v for v in key])
sys.stdout.write('buf: %s\n'%[v for v in buf])
sys.stdout.write('~~~~ first call ~~~~\n')
ntv_fnct(msg, key, buf, len(msg), len(key))
sys.stdout.write('msg: %s\n'%[v for v in msg])
sys.stdout.write('key: %s\n'%[v for v in key])
sys.stdout.write('buf: %s\n'%[v for v in buf])
(msg, buf)=(buf, msg)
for i in range(len(buf)):
buf[i]=0
sys.stdout.write('~~~~ second initial state ~~~~\n')
sys.stdout.write('msg: %s\n'%[v for v in msg])
sys.stdout.write('key: %s\n'%[v for v in key])
sys.stdout.write('buf: %s\n'%[v for v in buf])
sys.stdout.write('~~~~ second call ~~~~\n')
ntv_fnct(msg, key, buf, len(msg), len(key))
sys.stdout.write('msg: %s\n'%[v for v in msg])
sys.stdout.write('key: %s\n'%[v for v in key])
sys.stdout.write('buf: %s\n'%[v for v in buf])
Running this python code shows this result
~~~~ first initial state ~~~~
msg: [1, 11, 21, 31, 41, 51, 61, 71, 81, 91]
key: [-41, -61, -81, -101]
buf: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
~~~~ first call ~~~~
msg: [1, 11, 21, 31, 41, 51, 61, 71, 81, 91]
key: [-41, -61, -81, -101]
buf: [-42, -56, -70, -124, -2, -16, -110, -36, -122, -104]
~~~~ second initial state ~~~~
msg: [-42, -56, -70, -124, -2, -16, -110, -36, -122, -104]
key: [-41, -61, -81, -101]
buf: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
~~~~ second call ~~~~
msg: [-42, -56, -70, -124, -2, -16, -110, -36, -122, -104]
key: [-41, -61, -81, -101]
buf: [1, 11, 21, 31, 41, 51, 61, 71, 81, 91]
Note that I replaced these two lines in your rust code
let key_len_i8: i8 = key_len.try_into().unwrap();
*buf.offset(i) = *msg.offset(i) ^ (*key.offset(i) % key_len_i8);
by these
let key_len_isize: isize = key_len.try_into().unwrap();
*buf.offset(i) = *msg.offset(i) ^ (*key.offset(i % key_len_isize));
because the offset on key was going out of bounds.
Maybe, instead of dealing with pointer arithmetic and unsigned/signed conversions, should you build usual slices from the raw pointers and use them in the safe way?
#[no_mangle]
pub fn cipher(
msg_ptr: *const i8,
key_ptr: *const i8,
buf_ptr: *mut i8,
msg_len: usize,
key_len: usize,
) {
let msg = unsafe { std::slice::from_raw_parts(msg_ptr, msg_len) };
let key = unsafe { std::slice::from_raw_parts(key_ptr, key_len) };
let buf = unsafe { std::slice::from_raw_parts_mut(buf_ptr, msg_len) };
for i in 0..msg_len {
buf[i] = msg[i] ^ key[i % key_len];
}
}

Related

I got Errors in C++ when try to use modules from python scripts

I have a python script that read IMU data from the sensor. This is the python script:
import Microcontroller_Manager_Serial as Serial
import Microcontroller_Manager_Bluetooth as Bluetooth
## Function which get the current accelerometer full-scale value
def IMU_Get_Accelerometer_Full_Scale(communication_mode):
data_out = [81, 58, 0, 58, 13, 10]
data_in = [48]
current_data_size = 10
current_time_out = 0.5
if (communication_mode == 2):
Bluetooth.Bluetooth_Send_Data(bytes(data_out))
data_in = Bluetooth.Bluetooth_Receive_Data(current_data_size,current_time_out)
else:
Serial.Serial_Port_Send_Data(data_out)
data_in = Serial.Serial_Port_Receive_Data(current_data_size,current_time_out)
return data_in
## Function which set the current accelerometer full-scale value
def IMU_Set_Accelerometer_Full_Scale(communication_mode,choice):
data_out = [81, 58, 1, 58, choice, 58, 13, 10]
data_in = [48]
current_data_size = 10
current_time_out = 0.5
if (communication_mode == 2):
Bluetooth.Bluetooth_Send_Data(bytes(data_out))
data_in = Bluetooth.Bluetooth_Receive_Data(current_data_size,current_time_out)
else:
Serial.Serial_Port_Send_Data(data_out)
data_in = Serial.Serial_Port_Receive_Data(current_data_size,current_time_out)
return data_in
## Function which get the current gyroscope full-scale value
def IMU_Get_Gyroscope_Full_Scale(communication_mode):
data_out = [82, 58, 0, 58, 13, 10]
data_in = [48]
current_data_size = 10
current_time_out = 0.5
if (communication_mode == 2):
Bluetooth.Bluetooth_Send_Data(bytes(data_out))
data_in = Bluetooth.Bluetooth_Receive_Data(current_data_size,current_time_out)
else:
Serial.Serial_Port_Send_Data(data_out)
data_in = Serial.Serial_Port_Receive_Data(current_data_size,current_time_out)
return data_in
## Function which set the current gyroscope full-scale value
def IMU_Set_Gyroscope_Full_Scale(communication_mode,choice):
data_out = [82, 58, 1, 58, choice, 58, 13, 10]
data_in = [48]
current_data_size = 10
current_time_out = 0.5
if (communication_mode == 2):
Bluetooth.Bluetooth_Send_Data(bytes(data_out))
data_in = Bluetooth.Bluetooth_Receive_Data(current_data_size,current_time_out)
else:
Serial.Serial_Port_Send_Data(data_out)
data_in = Serial.Serial_Port_Receive_Data(current_data_size,current_time_out)
return data_in
## Function which get measurement from the IMU sensor
def IMU_Get_Values(communication_mode,choice):
data_out = [83, 58, choice, 58, 13, 10]
data_in = [48]
current_data_size = 30
current_time_out = 0.5
if (communication_mode == 2):
Bluetooth.Bluetooth_Send_Data(bytes(data_out))
data_in = Bluetooth.Bluetooth_Receive_Data(current_data_size,current_time_out)
else:
Serial.Serial_Port_Send_Data(data_out)
data_in = Serial.Serial_Port_Receive_Data(current_data_size,current_time_out)
return data_in
## Function which get the sensivity adjustment parameters for the magnetometer
def IMU_Get_Magnetometer_Parameters(communication_mode):
data_out = [84, 13, 10]
data_in = [48]
current_data_size = 10
current_time_out = 0.5
if (communication_mode == 2):
Bluetooth.Bluetooth_Send_Data(bytes(data_out))
data_in = Bluetooth.Bluetooth_Receive_Data(current_data_size,current_time_out)
else:
Serial.Serial_Port_Send_Data(data_out)
data_in = Serial.Serial_Port_Receive_Data(current_data_size,current_time_out)
return data_in
So from that python Script I need a C++ code that can ask for IMU data update and publish it. This is my C++ code
#include <stdio.h>
#include <string.h>
#include <wiringSerial.h>
export module Microcontroller_Manager_Serial.Serial;
int Communication_Mode_ = 0;
double data_recieved=0;
int main()
{
int serial_port = serialOpen ("/dev/serial0", 115200);
if (serial_port > 0 ) {
data_recieved = IMU.IMU_Get_Values(1, 1)
printf("Raw data received: ", data_received)
}
}
But I got the following errors:
get_data_test.cpp:5:8: error: ‘module’ does not name a type; did you
mean ‘double’? export module Microcontroller_Manager_Serial.Serial;
^~~~~~
double get_data_test.cpp: In function ‘int main()’: get_data_test.cpp:20:20: error: ‘IMU’ was not declared in this scope
data_recieved = IMU.IMU_Get_Values(1, 1)
Any help?

How do I encrypt and decrypt a string in python 3.7?

I've found this exactly same question. But PyCrypto doesn't install both on python 3.6.5 and 3.7.0.
So, I implement some kind of Gronsfeld-like cipher. I know, that's awful, but I can simply encrypt and derypt string with password
def encrypt(string, password):
int_list = []
password_len = len(password)
for cnt, sym in enumerate(string):
password_sym = password[cnt % password_len]
int_list.append(ord(sym)-ord(password_sym))
return int_list
# got some list which contain mine key to Todoist api, yes, this can be bruteforced, but same as any other API key
>>> [-20, -20, -50, -14, -61, -54, 2, 0, 32, 27, -51, -21, -54, -53, 4, 3, 29, -14, -51, 29, -10, -6, 1, 4, 28,
29, -55, -17, -59, -42, 2, 50, -13, -14, -52, -15, -56, -59, -44, 4]
def decrypt(int_list, password):
output_string = ""
password_len = len(password)
for cnt, numb in enumerate(int_list):
password_sym = password[cnt % password_len]
output_string += chr(numb+ord(password_sym))
return output_string
So, how to do it properly?
Cryptography is an actively developed library that provides cryptographic recipes and primitives. It supports Python 2.6-2.7, Python 3.3+ and PyPy.
Installation
$ pip install cryptography
Example code using high level symmetric encryption recipe:
from cryptography.fernet import Fernet
key = Fernet.generate_key()
cipher_suite = Fernet(key)
cipher_text = cipher_suite.encrypt(b"A really secret message. Not for prying eyes.")
plain_text = cipher_suite.decrypt(cipher_text)

Python NRF24 convert Payload

since some days I try to connect an Arduino with a Raspberry by an NRF24 module.
By the examples i got some working code in C (#raspberry) but I want to use python due to the web app.
The problem is I can't interpret the msq send by the Arduino. The Arduino should send 8 times uint32_t in one payload. After receive in Raspberry I got 8 times 4 uint8_t.
Arduino MSG:
uint32_t SendMsg[8] = {222, micros(), 10, 20, 30, 40, 50, 60};
Raspberry RCV:
Received: [222, 0, 0, 0, 24, 22, 97, 210, 10, 0, 0, 0, 20, 0, 0, 0, 30, 0, 0, 0, 40, 0, 0, 0, 50, 0, 0, 0, 60, 0, 0, 0]
The Raspberry "destroy" the int32 to 4 times int8 ... how I may combine them again?
The RCV is a list of int.
Here the complete code:
Raspberry:
import RPi.GPIO as GPIO
from lib_nrf24 import NRF24
import time
import spidev
GPIO.setmode(GPIO.BCM)
pipes = [0xAB, 0xCD, 0xAB, 0xCD, 0x71]
radio = NRF24(GPIO, spidev.SpiDev())
radio.begin(0, 22)
radio.setPayloadSize(32)
radio.setChannel(77)
radio.setDataRate(NRF24.BR_1MBPS)
radio.setPALevel(NRF24.PA_MIN)
radio.setAutoAck(True)
radio.enableDynamicPayloads()
radio.enableAckPayload()
radio.openReadingPipe(1,pipes)
radio.printDetails()
radio.startListening()
while(1):
# ackPL = [1]
while not radio.available(0):
time.sleep(1 / 100)
receivedMessage = []
radio.read(receivedMessage, radio.getDynamicPayloadSize())
print("Received: {}".format(receivedMessage))
Arduino
//Send.ino
#include<SPI.h>
#include<RF24.h>
// ce, csn pins
RF24 radio(6, 10);
uint32_t RcvMsg[8] = {0, 0, 0, 0, 0, 0, 0, 0};
void setup(void){
radio.begin();
radio.setPALevel(RF24_PA_MAX);
radio.setChannel(77);
radio.openWritingPipe(0xABCDABCD71LL);
radio.enableDynamicPayloads();
radio.powerUp();
}
void loop(void){
const char text[] = "Hello World is awe DOAs";
uint32_t SendMessage;
uint32_t SendMsg[8] = {222, micros(), 10, 20, 30, 40, 50, 60};
radio.write(&SendMsg, sizeof(SendMsg));
if ( radio.isAckPayloadAvailable() ) {
radio.read(&RcvMsg, sizeof(RcvMsg));
}
Serial.print(" SendMsg[0]: ");Serial.print(SendMsg[0]); Serial.print(" SendMsg[1]: ");Serial.print(SendMsg[1]); Serial.print(" SendMsg[2]: ");Serial.print(SendMsg[2]); Serial.print(" SendMsg[3]: ");Serial.print(SendMsg[3]); Serial.print(" SendMsg[4]: ");Serial.print(SendMsg[4]);Serial.print(" SendMsg[5]: ");Serial.print(SendMsg[5]);Serial.print(" SendMsg[6]: ");Serial.print(SendMsg[6]); Serial.print(" SendMsg[7]: ");Serial.println(SendMsg[7]);
Serial.print(" RcvMsg[0] : ");Serial.print(RcvMsg[0]);Serial.print(" RcvMsg[1] : ");Serial.print(RcvMsg[1]);Serial.print(" RcvMsg[2] : ");Serial.print(RcvMsg[2]);Serial.print(" RcvMsg[3] : ");Serial.print(RcvMsg[3]); Serial.print(" RcvMsg[4] : ");Serial.print(RcvMsg[4]);Serial.print(" RcvMsg[5] : ");Serial.print(RcvMsg[5]);Serial.print(" RcvMsg[6] : ");Serial.print(RcvMsg[6]);Serial.print(" RcvMsg[7] : ");Serial.println(RcvMsg[7]);
delay(1000);
}
I found a solution:
#============================================================================#
#============================================================================#
# translate radio Msq from byte to int
#============================================================================#
def translate_from_radio(msg, size):
translated_msg=[]
for i in range (0, size, 4):
translated_msg.append(int.from_bytes([msg[i+3], msg[i+2], msg[i+1], msg[i]], byteorder='big'))
#print("Translate FROM Radio: " + str(msg) + " --> " + str(translated_msg))
return translated_msg
#============================================================================#
#============================================================================#
# Split the msg element in 4 bytes and add it to translated msg
#============================================================================#
def translate_to_radio(msg):
translated_msg=[]
for i in range (0, len(msg)):
x=msg[i].to_bytes(4, byteorder='big')
for g in reversed(x):
translated_msg.append(g)
#print("Translate TO Radio: " + str(msg) + " --> " + str(translated_msg))
return translated_msg

Controlling many servos using Arduino and Python over serial continuously

I've been working on this piece of code to control a large array of servos on Arduino using Python.
The code works, but I've seen some strange behavior surrounding timing.
Between each update to the servos, I need a sleep command.
If I don't insert a sleep, the program will work the first time I run it, but if the Python code gets stopped, then I try to connect over serial with Python again, the Arduino is non responsive.
What could be happening to the serial port and how could I prevent this from happening?
Also, am I parsing the data coming into the Arduino the most efficient way possible using memcpy? Or is there a better or more standard way to do this?
I've read about using serialEvent, is there a benefit to using that command for parsing serial data in this situation?
//ARDUINO CODE
#include <Servo.h>
int servoPins[] = {9, 10};
//int servoPins[] = {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38};
//int servoPins[] = {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34};
const int numServos = sizeof(servoPins)/sizeof(int);
Servo servos[numServos];
char serialData[numServos * 3];
char tempData[3];
void setup() {
Serial.begin(9600);
Serial.println("Ready");
for (int i=0; i < numServos; i++) {
servos[i].attach(servoPins[i]);
//servos[i].write(20);
}
}
void loop() {
if (Serial.available()) {
Serial.readBytesUntil('\0', serialData, numServos * 3);
for (int i=0; i < numServos; i++) {
memmove(tempData, serialData + i * 3, 3);
servos[i].write(atoi(tempData));
}
}
}
#PYTHON CODE
""" Control n servos on n arduinos over serial
"""
import glob
import platform
import serial
from time import sleep
import sys
'''
Servo
Id number, angle for servo, Arduino port servo is on
For now the port is used as an index, so please number them going from 0-n
'''
class Servo:
id_num = 0
angle = 0
port = 0
def __init__(self, id_num, angle, port):
self.id_num = id_num
self.angle = angle
self.port = port
'''
ServoDriver
-Stores a list of servos
-Open ports to Arduinos
-Stores a list of those ports
-Creates a map of which servos are on which ports
-Provides a way to update the angle of all servos on all ports
'''
class ServoDriver:
def __init__(self, servos):
self.ports = self.open_ports()
self.servos = servos
# looks for all devices that have the same name pattern as an Arduino and opens them
def open_ports(self):
# find arduinos
# note: currently this is Mac only
devices = glob.glob('/dev/tty.usbmodem*')
print devices
if len(devices) == 0:
print "No Arduinos found"
sys.ext(1)
ports = []
for device in devices:
try:
# connect to serial port
ports.append(serial.Serial(device, 9600))
except:
print 'Failed to open port'
sys.ext(1)
# need a short delay right after serial port is started for the Arduino to initialize
sleep(2)
return ports
# update the angle of all servos on all ports
def update(self, servos):
debug = True
servo_data = []
for p in self.ports:
servo_data.append('')
for servo_update in servos:
for servo_stored in self.servos:
if servo_stored.id_num == servo_update.id_num:
this_port = servo_stored.port
break
# constrain servo angles to avoid damaging servos
if servo_update.angle > 135:
servo_update.angle = 135
else if servo_update.angle < 45:
servo_update.angle = 45
# append angle to the datum for this port
servo_data[this_port] = servo_data[this_port] + str(servo_update.angle).zfill(3)
for servo_datum in servo_data:
# append null byte for arduino to recognize end of data
servo_datum = servo_datum + "\0"
# send data to the Arduinos
for port,servo_datum in zip(self.ports,servo_data):
port.write(servo_datum)
def close_ports(self):
print 'closing ports'
for port in self.ports:
port.close()
# generates values for making a servo sweep back and forth
def servo_iter():
l = []
#for i in range(0,1):
l.append(40)
#for i in range(0,1):
l.append(80)
for pos in l:
yield pos
def servo_iter_2(total):
for i in range(0,total):
yield i
if __name__ == "__main__":
# create a list of servos with mappings to ports
# if you have the wrong number of servos it acts weird
#num_servos = 32
num_servos = 2
servos = []
servos.append(Servo(0, 40, 0))
servos.append(Servo(1, 40, 0))
'''
servos.append(Servo(2, 40, 0))
servos.append(Servo(3, 40, 0))
servos.append(Servo(4, 40, 0))
servos.append(Servo(5, 40, 0))
servos.append(Servo(6, 40, 0))
servos.append(Servo(7, 40, 0))
servos.append(Servo(8, 40, 0))
servos.append(Servo(9, 40, 0))
servos.append(Servo(10, 40, 0))
servos.append(Servo(11, 40, 0))
servos.append(Servo(12, 40, 0))
servos.append(Servo(13, 40, 0))
servos.append(Servo(14, 40, 0))
servos.append(Servo(15, 40, 0))
servos.append(Servo(16, 40, 0))
servos.append(Servo(17, 40, 0))
servos.append(Servo(18, 40, 0))
servos.append(Servo(19, 40, 0))
servos.append(Servo(20, 40, 0))
servos.append(Servo(21, 40, 0))
servos.append(Servo(22, 40, 0))
servos.append(Servo(23, 40, 0))
servos.append(Servo(24, 40, 0))
servos.append(Servo(25, 40, 0))
servos.append(Servo(26, 40, 0))
servos.append(Servo(27, 40, 0))
servos.append(Servo(28, 40, 0))
servos.append(Servo(29, 40, 0))
servos.append(Servo(30, 40, 0))
servos.append(Servo(31, 40, 0))
servos.append(Servo(32, 40, 0))
'''
#if len(servos) != num_servos:
# print 'wrong number of servos'
# sys.ext(1)
# comment out the next two lines if you only have 1 arduino
#servos.append(Servo(2, 40, 1))
#servos.append(Servo(3, 40, 1))
#servos.append(Servo(4, 40, 2))
#servos.append(Servo(5, 40, 2))
angles = []
for i in range(0,len(servos)):
angles.append(40)
try:
# instantiate a driver
# must happen inside try-finally
driver = ServoDriver(servos)
iter1 = False
if iter1:
pos = servo_iter()
else:
pos = servo_iter_2(len(servos))
while True:
try:
x = pos.next()
except StopIteration:
if iter1:
pos = servo_iter()
else:
pos = servo_iter_2(len(servos))
x = pos.next()
# create a list of servos with ids and angles to update positions of servos
if iter1:
for servo in servos:
servo.angle = x
else:
for i,servo in zip(angles,servos):
servo.angle = i
# call the driver with the list of servos
driver.update(servos)
sleep(0.5)
for i in range(0, len(servos)):
if i == x:
angles[i] = 80
else:
angles[i] = 40
# close the serial port on exit, or you will have to unplug the arduinos to connect again
finally:
driver.close_ports()
I think the right answer here was to add a delay to the Arduino.
Your Arduino will reset every time you make a serial connection over usb. If you want a temporary disable, you can insert a 10 microfarad capacitor between ground and reset. This will keep it from auto-reseting, and can be easily removed for program uploading.

generate list of text content with pyqt?

I have loaded HTML into pyqt and would like to create a list of all the content on the page.
I then need to be able to get the position of the text, using .geometry()
I would like a list of objects, where the following would be possible:
for i in list_of_content_in_html:
print i.toPlainText(), i.geometry() #prints the text, and the position.
In case I am unclear, by "contents" I mean in the HTML below, contents is
'c', 'r1 c1', 'r1, c2', 'row2 c2', 'more contents' - the text the web user sees in the browser, basically.
c
<table border="1">
<tr>
<td>r1 c1</td>
<td>r1 c2</td>
</tr>
<tr>
<td></td>
<td>row2 c2</td>
</tr>
</table>
more contents
This doesn't seem to be possible using QtWebKit and pages like this one, that nest objects but don't use <p>...</p> for other text, that is outside of the table. In result c and more contents don't go into separate QWebElements. They are only to be found in the BODY level block. As a solution one could run that page through a parser. Simply traversing through children of currentFrame documentElement brings out following elements:
# position in element tree, bounding box, tag, text:
(0, 0) [0, 0, 75, 165] HTML - u'c\nr1 c1\tr1 c2\nrow2 c2\nmore contents'
(1, 1) [8, 8, 67, 157] BODY - u'c\nr1 c1\tr1 c2\nrow2 c2\nmore contents'
(2, 0) [8, 27, 75, 119] TABLE - u'r1 c1\tr1 c2\nrow2 c2'
(3, 0) [9, 28, 74, 118] TBODY - u'r1 c1\tr1 c2\nrow2 c2'
(4, 0) [9, 30, 74, 72] TR - u'r1 c1\tr1 c2'
(5, 0) [11, 30, 32, 72] TD - u'r1 c1'
(5, 1) [34, 30, 72, 72] TD - u'r1 c2'
(4, 1) [9, 74, 74, 116] TR - u'row2 c2'
(5, 1) [34, 74, 72, 116] TD - u'row2 c2'
Code for this:
import sys
from PySide.QtCore import *
from PySide.QtGui import *
from PySide.QtWebKit import *
class WebPage(QObject):
finished = Signal()
def __init__(self, data, parent=None):
super(WebPage, self).__init__(parent)
self.output = []
self.data = data
self.page = QWebPage()
self.page.loadFinished.connect(self.process)
def start(self):
self.page.mainFrame().setHtml(self.data)
#Slot(bool)
def process(self, something=False):
self.page.setViewportSize(self.page.mainFrame().contentsSize())
frame = self.page.currentFrame()
elem = frame.documentElement()
self.gather_info(elem)
self.finished.emit()
def gather_info(self, elem, i=0):
if i > 200: return
cnt = 0
while cnt < 100:
s = elem.toPlainText()
rect = elem.geometry()
name = elem.tagName()
dim = [rect.x(), rect.y(),
rect.x() + rect.width(), rect.y() + rect.height()]
if s: self.output.append(dict(pos=(i, cnt), dim=dim, tag=name, text=s))
child = elem.firstChild()
if not child.isNull():
self.gather_info(child, i+1)
elem = elem.nextSibling()
if elem.isNull():
break
cnt += 1
webpage = None
def print_strings():
for s in webpage.output:
print s['pos'], s['dim'], s['tag'], '-', repr(s['text'])
if __name__ == '__main__':
app = QApplication(sys.argv)
data = open(sys.argv[1]).read()
webpage = WebPage(data)
webpage.finished.connect(print_strings)
webpage.start()
.
A different approach
The desired course of action depends on what you want to achieve. You can get all the strings from the QWebPage using webpage.currentFrame().documentElement().toPlainText(), but that just shows the whole page as a string with no positioning information related to all the tags. Browsing the QWebElement tree gives you the desired information but it has the drawbacks, which I mentioned above.
If you really want to know the position of all text, The only accurate way to do this (other than rendering the page and using OCR) is breaking text into characters and saving their individual bounding boxes. Here's how I did it:
First I parsed the page with BeautifulSoup4 and enclosed every non-space text character X in a <span class="Nd92KSx3u2">X</span>. Then I ran a PyQt script (actually a PySide script) which loads the altered page and printed out the characters with their bounding boxes after I looked them up using findAllElements('span[class="Nd92KSx3u2"]').
parser.py:
import sys, cgi, re
from bs4 import BeautifulSoup, element
magical_class = "Nd92KSx3u2"
restricted_tags="title script object embed".split()
re_my_span = re.compile(r'<span class="%s">(.+?)</span>' % magical_class)
def no_nl(s): return str(s).replace("\r", "").replace("\n", " ")
if len(sys.argv) != 3:
print "Usage: %s <input_html_file> <output_html_file>" % sys.argv[0]
sys.exit(1)
def process(elem):
for x in elem.children:
if isinstance(x, element.Comment): continue
if isinstance(x, element.Tag):
if x.name in restricted_tags:
continue
if isinstance(x, element.NavigableString):
if not len(no_nl(x.string).strip()):
continue # it's just empty space
print '[', no_nl(x.string).strip(), ']', # debug output of found strings
s = ""
for c in x.string:
if c in (' ', '\r', '\n', '\t'): s += c
else: s += '<span class="%s">%s</span>' % (magical_class, c)
x.replace_with(s)
continue
process(x)
soup = BeautifulSoup(open(sys.argv[1]))
process(soup)
output = re_my_span.sub(r'<span class="%s">\1</span>' % magical_class, str(soup))
with open(sys.argv[2], 'w') as f:
f.write(output)
charpos.py:
import sys
from PySide.QtCore import *
from PySide.QtGui import *
from PySide.QtWebKit import *
magical_class = "Nd92KSx3u2"
class WebPage(QObject):
def __init__(self, data, parent=None):
super(WebPage, self).__init__(parent)
self.output = []
self.data = data
self.page = QWebPage()
self.page.loadFinished.connect(self.process)
def start(self):
self.page.mainFrame().setHtml(self.data)
#Slot(bool)
def process(self, something=False):
self.page.setViewportSize(self.page.mainFrame().contentsSize())
frame = self.page.currentFrame()
elements = frame.findAllElements('span[class="%s"]' % magical_class)
for e in elements:
s = e.toPlainText()
rect = e.geometry()
dim = [rect.x(), rect.y(),
rect.x() + rect.width(), rect.y() + rect.height()]
if s and rect.width() > 0 and rect.height() > 0: print dim, s
if __name__ == '__main__':
app = QApplication(sys.argv)
data = open(sys.argv[1]).read()
webpage = WebPage(data)
webpage.start()
input.html (slightly altered to show more problems with simple string dumping:
a<span>b<span>c</span></span>
<table border="1">
<tr><td>r1 <font>c1</font> </td><td>r1 c2</td></tr>
<tr><td></td><td>row2 & c2</td></tr>
</table>
more <b>contents</b>
and the test run:
$ python parser.py input.html temp.html
[ a ] [ b ] [ c ] [ r1 ] [ c1 ] [ r1 c2 ] [ row2 & c2 ] [ more ] [ contents ]
$ charpos.py temp.html
[8, 8, 17, 26] a
[17, 8, 26, 26] b
[26, 8, 34, 26] c
[13, 48, 18, 66] r
[18, 48, 27, 66] 1
[13, 67, 21, 85] c
[21, 67, 30, 85] 1
[36, 48, 41, 66] r
[41, 48, 50, 66] 1
[36, 67, 44, 85] c
[44, 67, 53, 85] 2
[36, 92, 41, 110] r
[41, 92, 50, 110] o
[50, 92, 61, 110] w
[61, 92, 70, 110] 2
[36, 111, 47, 129] &
[51, 111, 59, 129] c
[59, 111, 68, 129] 2
[8, 135, 21, 153] m
[21, 135, 30, 153] o
[30, 135, 35, 153] r
[35, 135, 44, 153] e
[8, 154, 17, 173] c
[17, 154, 27, 173] o
[27, 154, 37, 173] n
[37, 154, 42, 173] t
[42, 154, 51, 173] e
[51, 154, 61, 173] n
[61, 154, 66, 173] t
[66, 154, 75, 173] s
Looking at the bounding boxes, it is (in this simple case without changes in font size and things like subscripts) quite easy to glue them back into words if you wish.
I worked it out.
for elem in QWebView().page().currentFrame().documentElement().findAll('*'):
print unicode(elem.toPlainText()), unicode(elem.geometry().getCoords()), '\n'
It matches anything, and then iterates over what is found - thereby iterating over the DOM tree.

Categories