I've been working on this piece of code to control a large array of servos on Arduino using Python.
The code works, but I've seen some strange behavior surrounding timing.
Between each update to the servos, I need a sleep command.
If I don't insert a sleep, the program will work the first time I run it, but if the Python code gets stopped, then I try to connect over serial with Python again, the Arduino is non responsive.
What could be happening to the serial port and how could I prevent this from happening?
Also, am I parsing the data coming into the Arduino the most efficient way possible using memcpy? Or is there a better or more standard way to do this?
I've read about using serialEvent, is there a benefit to using that command for parsing serial data in this situation?
//ARDUINO CODE
#include <Servo.h>
int servoPins[] = {9, 10};
//int servoPins[] = {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38};
//int servoPins[] = {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34};
const int numServos = sizeof(servoPins)/sizeof(int);
Servo servos[numServos];
char serialData[numServos * 3];
char tempData[3];
void setup() {
Serial.begin(9600);
Serial.println("Ready");
for (int i=0; i < numServos; i++) {
servos[i].attach(servoPins[i]);
//servos[i].write(20);
}
}
void loop() {
if (Serial.available()) {
Serial.readBytesUntil('\0', serialData, numServos * 3);
for (int i=0; i < numServos; i++) {
memmove(tempData, serialData + i * 3, 3);
servos[i].write(atoi(tempData));
}
}
}
#PYTHON CODE
""" Control n servos on n arduinos over serial
"""
import glob
import platform
import serial
from time import sleep
import sys
'''
Servo
Id number, angle for servo, Arduino port servo is on
For now the port is used as an index, so please number them going from 0-n
'''
class Servo:
id_num = 0
angle = 0
port = 0
def __init__(self, id_num, angle, port):
self.id_num = id_num
self.angle = angle
self.port = port
'''
ServoDriver
-Stores a list of servos
-Open ports to Arduinos
-Stores a list of those ports
-Creates a map of which servos are on which ports
-Provides a way to update the angle of all servos on all ports
'''
class ServoDriver:
def __init__(self, servos):
self.ports = self.open_ports()
self.servos = servos
# looks for all devices that have the same name pattern as an Arduino and opens them
def open_ports(self):
# find arduinos
# note: currently this is Mac only
devices = glob.glob('/dev/tty.usbmodem*')
print devices
if len(devices) == 0:
print "No Arduinos found"
sys.ext(1)
ports = []
for device in devices:
try:
# connect to serial port
ports.append(serial.Serial(device, 9600))
except:
print 'Failed to open port'
sys.ext(1)
# need a short delay right after serial port is started for the Arduino to initialize
sleep(2)
return ports
# update the angle of all servos on all ports
def update(self, servos):
debug = True
servo_data = []
for p in self.ports:
servo_data.append('')
for servo_update in servos:
for servo_stored in self.servos:
if servo_stored.id_num == servo_update.id_num:
this_port = servo_stored.port
break
# constrain servo angles to avoid damaging servos
if servo_update.angle > 135:
servo_update.angle = 135
else if servo_update.angle < 45:
servo_update.angle = 45
# append angle to the datum for this port
servo_data[this_port] = servo_data[this_port] + str(servo_update.angle).zfill(3)
for servo_datum in servo_data:
# append null byte for arduino to recognize end of data
servo_datum = servo_datum + "\0"
# send data to the Arduinos
for port,servo_datum in zip(self.ports,servo_data):
port.write(servo_datum)
def close_ports(self):
print 'closing ports'
for port in self.ports:
port.close()
# generates values for making a servo sweep back and forth
def servo_iter():
l = []
#for i in range(0,1):
l.append(40)
#for i in range(0,1):
l.append(80)
for pos in l:
yield pos
def servo_iter_2(total):
for i in range(0,total):
yield i
if __name__ == "__main__":
# create a list of servos with mappings to ports
# if you have the wrong number of servos it acts weird
#num_servos = 32
num_servos = 2
servos = []
servos.append(Servo(0, 40, 0))
servos.append(Servo(1, 40, 0))
'''
servos.append(Servo(2, 40, 0))
servos.append(Servo(3, 40, 0))
servos.append(Servo(4, 40, 0))
servos.append(Servo(5, 40, 0))
servos.append(Servo(6, 40, 0))
servos.append(Servo(7, 40, 0))
servos.append(Servo(8, 40, 0))
servos.append(Servo(9, 40, 0))
servos.append(Servo(10, 40, 0))
servos.append(Servo(11, 40, 0))
servos.append(Servo(12, 40, 0))
servos.append(Servo(13, 40, 0))
servos.append(Servo(14, 40, 0))
servos.append(Servo(15, 40, 0))
servos.append(Servo(16, 40, 0))
servos.append(Servo(17, 40, 0))
servos.append(Servo(18, 40, 0))
servos.append(Servo(19, 40, 0))
servos.append(Servo(20, 40, 0))
servos.append(Servo(21, 40, 0))
servos.append(Servo(22, 40, 0))
servos.append(Servo(23, 40, 0))
servos.append(Servo(24, 40, 0))
servos.append(Servo(25, 40, 0))
servos.append(Servo(26, 40, 0))
servos.append(Servo(27, 40, 0))
servos.append(Servo(28, 40, 0))
servos.append(Servo(29, 40, 0))
servos.append(Servo(30, 40, 0))
servos.append(Servo(31, 40, 0))
servos.append(Servo(32, 40, 0))
'''
#if len(servos) != num_servos:
# print 'wrong number of servos'
# sys.ext(1)
# comment out the next two lines if you only have 1 arduino
#servos.append(Servo(2, 40, 1))
#servos.append(Servo(3, 40, 1))
#servos.append(Servo(4, 40, 2))
#servos.append(Servo(5, 40, 2))
angles = []
for i in range(0,len(servos)):
angles.append(40)
try:
# instantiate a driver
# must happen inside try-finally
driver = ServoDriver(servos)
iter1 = False
if iter1:
pos = servo_iter()
else:
pos = servo_iter_2(len(servos))
while True:
try:
x = pos.next()
except StopIteration:
if iter1:
pos = servo_iter()
else:
pos = servo_iter_2(len(servos))
x = pos.next()
# create a list of servos with ids and angles to update positions of servos
if iter1:
for servo in servos:
servo.angle = x
else:
for i,servo in zip(angles,servos):
servo.angle = i
# call the driver with the list of servos
driver.update(servos)
sleep(0.5)
for i in range(0, len(servos)):
if i == x:
angles[i] = 80
else:
angles[i] = 40
# close the serial port on exit, or you will have to unplug the arduinos to connect again
finally:
driver.close_ports()
I think the right answer here was to add a delay to the Arduino.
Your Arduino will reset every time you make a serial connection over usb. If you want a temporary disable, you can insert a 10 microfarad capacitor between ground and reset. This will keep it from auto-reseting, and can be easily removed for program uploading.
Related
I have a python script that read IMU data from the sensor. This is the python script:
import Microcontroller_Manager_Serial as Serial
import Microcontroller_Manager_Bluetooth as Bluetooth
## Function which get the current accelerometer full-scale value
def IMU_Get_Accelerometer_Full_Scale(communication_mode):
data_out = [81, 58, 0, 58, 13, 10]
data_in = [48]
current_data_size = 10
current_time_out = 0.5
if (communication_mode == 2):
Bluetooth.Bluetooth_Send_Data(bytes(data_out))
data_in = Bluetooth.Bluetooth_Receive_Data(current_data_size,current_time_out)
else:
Serial.Serial_Port_Send_Data(data_out)
data_in = Serial.Serial_Port_Receive_Data(current_data_size,current_time_out)
return data_in
## Function which set the current accelerometer full-scale value
def IMU_Set_Accelerometer_Full_Scale(communication_mode,choice):
data_out = [81, 58, 1, 58, choice, 58, 13, 10]
data_in = [48]
current_data_size = 10
current_time_out = 0.5
if (communication_mode == 2):
Bluetooth.Bluetooth_Send_Data(bytes(data_out))
data_in = Bluetooth.Bluetooth_Receive_Data(current_data_size,current_time_out)
else:
Serial.Serial_Port_Send_Data(data_out)
data_in = Serial.Serial_Port_Receive_Data(current_data_size,current_time_out)
return data_in
## Function which get the current gyroscope full-scale value
def IMU_Get_Gyroscope_Full_Scale(communication_mode):
data_out = [82, 58, 0, 58, 13, 10]
data_in = [48]
current_data_size = 10
current_time_out = 0.5
if (communication_mode == 2):
Bluetooth.Bluetooth_Send_Data(bytes(data_out))
data_in = Bluetooth.Bluetooth_Receive_Data(current_data_size,current_time_out)
else:
Serial.Serial_Port_Send_Data(data_out)
data_in = Serial.Serial_Port_Receive_Data(current_data_size,current_time_out)
return data_in
## Function which set the current gyroscope full-scale value
def IMU_Set_Gyroscope_Full_Scale(communication_mode,choice):
data_out = [82, 58, 1, 58, choice, 58, 13, 10]
data_in = [48]
current_data_size = 10
current_time_out = 0.5
if (communication_mode == 2):
Bluetooth.Bluetooth_Send_Data(bytes(data_out))
data_in = Bluetooth.Bluetooth_Receive_Data(current_data_size,current_time_out)
else:
Serial.Serial_Port_Send_Data(data_out)
data_in = Serial.Serial_Port_Receive_Data(current_data_size,current_time_out)
return data_in
## Function which get measurement from the IMU sensor
def IMU_Get_Values(communication_mode,choice):
data_out = [83, 58, choice, 58, 13, 10]
data_in = [48]
current_data_size = 30
current_time_out = 0.5
if (communication_mode == 2):
Bluetooth.Bluetooth_Send_Data(bytes(data_out))
data_in = Bluetooth.Bluetooth_Receive_Data(current_data_size,current_time_out)
else:
Serial.Serial_Port_Send_Data(data_out)
data_in = Serial.Serial_Port_Receive_Data(current_data_size,current_time_out)
return data_in
## Function which get the sensivity adjustment parameters for the magnetometer
def IMU_Get_Magnetometer_Parameters(communication_mode):
data_out = [84, 13, 10]
data_in = [48]
current_data_size = 10
current_time_out = 0.5
if (communication_mode == 2):
Bluetooth.Bluetooth_Send_Data(bytes(data_out))
data_in = Bluetooth.Bluetooth_Receive_Data(current_data_size,current_time_out)
else:
Serial.Serial_Port_Send_Data(data_out)
data_in = Serial.Serial_Port_Receive_Data(current_data_size,current_time_out)
return data_in
So from that python Script I need a C++ code that can ask for IMU data update and publish it. This is my C++ code
#include <stdio.h>
#include <string.h>
#include <wiringSerial.h>
export module Microcontroller_Manager_Serial.Serial;
int Communication_Mode_ = 0;
double data_recieved=0;
int main()
{
int serial_port = serialOpen ("/dev/serial0", 115200);
if (serial_port > 0 ) {
data_recieved = IMU.IMU_Get_Values(1, 1)
printf("Raw data received: ", data_received)
}
}
But I got the following errors:
get_data_test.cpp:5:8: error: ‘module’ does not name a type; did you
mean ‘double’? export module Microcontroller_Manager_Serial.Serial;
^~~~~~
double get_data_test.cpp: In function ‘int main()’: get_data_test.cpp:20:20: error: ‘IMU’ was not declared in this scope
data_recieved = IMU.IMU_Get_Values(1, 1)
Any help?
Hello i tried to make fuzzy system by scikit-fuzzy, the output seems running well but when i try to figure it according scikit-fuzzy new api, my plot is not responding. I remember last time when I using scikit-fuzzy my plot is running well, whats wrong? Do my code is lack something?
import numpy as np
import skfuzzy as fuzz
from skfuzzy import control as ctrl
# crisp set
keramaian = ctrl.Antecedent(np.arange(0, 30, 1), 'KERAMAIAN')
ukuran = ctrl.Antecedent(np.arange(0, 25, 1), 'UKURAN')
vol = ctrl.Consequent(np.arange(0, 100, 1), 'UKURAN VOLUME')
# fuzzyfication
ukuran['Kecil'] = fuzz.trimf(ukuran.universe, [0, 0, 13])
ukuran['Sedang'] = fuzz.trimf(ukuran.universe, [0, 12, 24])
ukuran['Besar'] = fuzz.trimf(ukuran.universe, [12, 24, 24])
keramaian['sunyi'] = fuzz.trimf(keramaian.universe, [0, 0, 15])
keramaian['Cukup Ramai'] = fuzz.trimf(keramaian.universe, [0, 15, 29])
keramaian['Berisik'] = fuzz.trimf(keramaian.universe, [15, 29, 29])
vol['Pelan'] = fuzz.trimf(vol.universe, [0, 0, 40])
vol['Sedang'] = fuzz.trimf(vol.universe, [30, 50, 70])
vol['Kencang'] = fuzz.trimf(vol.universe, [60, 99, 99])
# rule set
rule1 = ctrl.Rule(ukuran['Kecil'] & keramaian['Berisik'], vol['Kencang'])
rule2 = ctrl.Rule(ukuran['Kecil'] & keramaian['Cukup Ramai'], vol['Kencang'])
rule3 = ctrl.Rule(ukuran['Kecil'] & keramaian['Cukup Ramai'], vol['Kencang'])
rule4 = ctrl.Rule(ukuran['Sedang'] & keramaian['Berisik'], vol['Sedang'])
rule5 = ctrl.Rule(ukuran['Sedang'] & keramaian['Cukup Ramai'], vol['Sedang'])
rule6 = ctrl.Rule(ukuran['Sedang'] & keramaian['sunyi'], vol['Sedang'])
rule7 = ctrl.Rule(ukuran['Besar'] & keramaian['Berisik'], vol['Pelan'])
rule8 = ctrl.Rule(ukuran['Besar'] & keramaian['Cukup Ramai'], vol['Pelan'])
rule9 = ctrl.Rule(ukuran['Besar'] & keramaian['sunyi'], vol['Pelan'])
vol_suara_ctrl = ctrl.ControlSystem([rule1, rule2, rule3, rule4, rule5, rule6, rule7, rule8, rule9])
vol_suara = ctrl.ControlSystemSimulation(vol_suara_ctrl)
keramaian['sunyi'].view()
vol_suara.input['KERAMAIAN'] = int(input("Masukkan jumlah keramaian?\n"))
vol_suara.input['UKURAN'] = int(input("Masukkan ukuran speaker?\n"))
# defuzification
vol_suara.compute()
vol.view(sim=vol_suara)
print(vol_suara.output['UKURAN VOLUME'])
Import import matplotlib.pyplot as plt
And try to use plt.show() at the end of your code.
I'm writing my first multiprocessing program in python.
I want to create a list of values to be processed, and 8 processes (number os CPU cores) will consume and process the list of values.
I wrote the following python code:
__author__ = 'Rui Martins'
from multiprocessing import cpu_count, Process, Lock, Value
def proc(lock, number_of_active_processes, valor):
lock.acquire()
number_of_active_processes.value+=1
print "Active processes:", number_of_active_processes.value
lock.release()
# DO SOMETHING ...
for i in range(1, 100):
valor=valor**2
# (...)
lock.acquire()
number_of_active_processes.value-=1
lock.release()
if __name__ == '__main__':
proc_number=cpu_count()
number_of_active_processes=Value('i', 0)
lock = Lock()
values=[11, 24, 13, 40, 15, 26, 27, 8, 19, 10, 11, 12, 13]
values_processed=0
processes=[]
for i in range(proc_number):
processes+=[Process()]
while values_processed<len(values):
while number_of_active_processes.value < proc_number and values_processed<len(values):
for i in range(proc_number):
if not processes[i].is_alive() and values_processed<len(values):
processes[i] = Process(target=proc, args=(lock, number_of_active_processes, values[values_processed]))
values_processed+=1
processes[i].start()
while number_of_active_processes.value == proc_number:
# BUG: always number_of_active_processes.value == 8 :(
print "Active processes:", number_of_active_processes.value
print ""
print "Active processes at END:", number_of_active_processes.value
And, I have the following problem:
The program never stop
I get out of RAM
Simplifying your code to the following:
def proc(lock, number_of_active_processes, valor):
lock.acquire()
number_of_active_processes.value += 1
print("Active processes:", number_of_active_processes.value)
lock.release()
# DO SOMETHING ...
for i in range(1, 100):
print(valor)
valor = valor **2
# (...)
lock.acquire()
number_of_active_processes.value -= 1
lock.release()
if __name__ == '__main__':
proc_number = cpu_count()
number_of_active_processes = Value('i', 0)
lock = Lock()
values = [11, 24, 13, 40, 15, 26, 27, 8, 19, 10, 11, 12, 13]
values_processed = 0
processes = [Process() for _ in range(proc_number)]
while values_processed < len(values)-1:
for p in processes:
if not p.is_alive():
p = Process(target=proc,
args=(lock, number_of_active_processes, values[values_processed]))
values_processed += 1
p.start()
If you run it like above the print(valor) added you see exactly what is happening, you are exponentially growing valor to the point you run out of memory, you don't get stuck in the while you get stuck in the for loop.
This is the output at the 12th process adding a print(len(srt(valor))) after a fraction of a second and it just keeps on going:
2
3
6
11
21
.........
59185
70726
68249
73004
77077
83805
93806
92732
90454
104993
118370
136498
131073
Just changing your loop to the following:
for i in range(1, 100):
print(valor)
valor = valor *2
The last number created is:
6021340351084089657109340225536
Using your own code you seem to get stuck in the while but it is valor is growing in the for loop to numbers with as many digits as:
167609
180908
185464
187612
209986
236740
209986
And on....
The problem is not your multiprocessing code. It's the pow operator in the for loop:
for i in range(1, 100):
valor=valor**2
the final result would be pow(val, 2**100), and this is too big, and calculate it would cost too much time and memory. so you got out of memory error in the last.
4 GB = 4 * pow(2, 10) * pow(2, 10) * pow(2, 20) * 8 bit = 2**35 bit
and for your smallest number 8:
pow(8, 2**100) = pow(2**3, 2**100) = pow(2, 3*pow(2, 100))
pow(2, 3*pow(2, 100))bit/4GB = 3*pow(2, 100-35) = 3*pow(2, 65)
it need 3*pow(2, 65) times of 4 GB memory.
from multiprocessing import Process , Queue
from datetime import datetime
c = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27]
out = Queue()
def support(m):
for k in m :
print "%s <-- hi" % k
out.put("done")
all = Queue()
temp = []
total = len(c)
count = 0
for m in c :
count += 1
total = total - 1
temp.append(m)
if count == 5 or total == 0 :
all.put(temp)
count = 0
temp = []
process_count = 3
while all.qsize() != 0 :
process_list = []
try :
for x in range(process_count) :
p = Process(target=support, args=(all.get(),))
process_list.append(p)
for p in process_list :
p.start()
for p in process_list :
p.join()
except Exception as e :
print e
while out.qsize != 0 :
print out.get()
print "all done"
I dont know why it does not end and does not print "all done" , just remain continuously in loop or keep executing .
Will be of great help if you can make this code more efficient but first i want to know why it does not end .
The problem is:
while out.qsize != 0 :
print out.get()
out.qsize is a function, so now you're comparing the function itself (not the return value!) with 0, with is of course always False.
You should use:
while out.qsize() != 0 :
print out.get()
I have loaded HTML into pyqt and would like to create a list of all the content on the page.
I then need to be able to get the position of the text, using .geometry()
I would like a list of objects, where the following would be possible:
for i in list_of_content_in_html:
print i.toPlainText(), i.geometry() #prints the text, and the position.
In case I am unclear, by "contents" I mean in the HTML below, contents is
'c', 'r1 c1', 'r1, c2', 'row2 c2', 'more contents' - the text the web user sees in the browser, basically.
c
<table border="1">
<tr>
<td>r1 c1</td>
<td>r1 c2</td>
</tr>
<tr>
<td></td>
<td>row2 c2</td>
</tr>
</table>
more contents
This doesn't seem to be possible using QtWebKit and pages like this one, that nest objects but don't use <p>...</p> for other text, that is outside of the table. In result c and more contents don't go into separate QWebElements. They are only to be found in the BODY level block. As a solution one could run that page through a parser. Simply traversing through children of currentFrame documentElement brings out following elements:
# position in element tree, bounding box, tag, text:
(0, 0) [0, 0, 75, 165] HTML - u'c\nr1 c1\tr1 c2\nrow2 c2\nmore contents'
(1, 1) [8, 8, 67, 157] BODY - u'c\nr1 c1\tr1 c2\nrow2 c2\nmore contents'
(2, 0) [8, 27, 75, 119] TABLE - u'r1 c1\tr1 c2\nrow2 c2'
(3, 0) [9, 28, 74, 118] TBODY - u'r1 c1\tr1 c2\nrow2 c2'
(4, 0) [9, 30, 74, 72] TR - u'r1 c1\tr1 c2'
(5, 0) [11, 30, 32, 72] TD - u'r1 c1'
(5, 1) [34, 30, 72, 72] TD - u'r1 c2'
(4, 1) [9, 74, 74, 116] TR - u'row2 c2'
(5, 1) [34, 74, 72, 116] TD - u'row2 c2'
Code for this:
import sys
from PySide.QtCore import *
from PySide.QtGui import *
from PySide.QtWebKit import *
class WebPage(QObject):
finished = Signal()
def __init__(self, data, parent=None):
super(WebPage, self).__init__(parent)
self.output = []
self.data = data
self.page = QWebPage()
self.page.loadFinished.connect(self.process)
def start(self):
self.page.mainFrame().setHtml(self.data)
#Slot(bool)
def process(self, something=False):
self.page.setViewportSize(self.page.mainFrame().contentsSize())
frame = self.page.currentFrame()
elem = frame.documentElement()
self.gather_info(elem)
self.finished.emit()
def gather_info(self, elem, i=0):
if i > 200: return
cnt = 0
while cnt < 100:
s = elem.toPlainText()
rect = elem.geometry()
name = elem.tagName()
dim = [rect.x(), rect.y(),
rect.x() + rect.width(), rect.y() + rect.height()]
if s: self.output.append(dict(pos=(i, cnt), dim=dim, tag=name, text=s))
child = elem.firstChild()
if not child.isNull():
self.gather_info(child, i+1)
elem = elem.nextSibling()
if elem.isNull():
break
cnt += 1
webpage = None
def print_strings():
for s in webpage.output:
print s['pos'], s['dim'], s['tag'], '-', repr(s['text'])
if __name__ == '__main__':
app = QApplication(sys.argv)
data = open(sys.argv[1]).read()
webpage = WebPage(data)
webpage.finished.connect(print_strings)
webpage.start()
.
A different approach
The desired course of action depends on what you want to achieve. You can get all the strings from the QWebPage using webpage.currentFrame().documentElement().toPlainText(), but that just shows the whole page as a string with no positioning information related to all the tags. Browsing the QWebElement tree gives you the desired information but it has the drawbacks, which I mentioned above.
If you really want to know the position of all text, The only accurate way to do this (other than rendering the page and using OCR) is breaking text into characters and saving their individual bounding boxes. Here's how I did it:
First I parsed the page with BeautifulSoup4 and enclosed every non-space text character X in a <span class="Nd92KSx3u2">X</span>. Then I ran a PyQt script (actually a PySide script) which loads the altered page and printed out the characters with their bounding boxes after I looked them up using findAllElements('span[class="Nd92KSx3u2"]').
parser.py:
import sys, cgi, re
from bs4 import BeautifulSoup, element
magical_class = "Nd92KSx3u2"
restricted_tags="title script object embed".split()
re_my_span = re.compile(r'<span class="%s">(.+?)</span>' % magical_class)
def no_nl(s): return str(s).replace("\r", "").replace("\n", " ")
if len(sys.argv) != 3:
print "Usage: %s <input_html_file> <output_html_file>" % sys.argv[0]
sys.exit(1)
def process(elem):
for x in elem.children:
if isinstance(x, element.Comment): continue
if isinstance(x, element.Tag):
if x.name in restricted_tags:
continue
if isinstance(x, element.NavigableString):
if not len(no_nl(x.string).strip()):
continue # it's just empty space
print '[', no_nl(x.string).strip(), ']', # debug output of found strings
s = ""
for c in x.string:
if c in (' ', '\r', '\n', '\t'): s += c
else: s += '<span class="%s">%s</span>' % (magical_class, c)
x.replace_with(s)
continue
process(x)
soup = BeautifulSoup(open(sys.argv[1]))
process(soup)
output = re_my_span.sub(r'<span class="%s">\1</span>' % magical_class, str(soup))
with open(sys.argv[2], 'w') as f:
f.write(output)
charpos.py:
import sys
from PySide.QtCore import *
from PySide.QtGui import *
from PySide.QtWebKit import *
magical_class = "Nd92KSx3u2"
class WebPage(QObject):
def __init__(self, data, parent=None):
super(WebPage, self).__init__(parent)
self.output = []
self.data = data
self.page = QWebPage()
self.page.loadFinished.connect(self.process)
def start(self):
self.page.mainFrame().setHtml(self.data)
#Slot(bool)
def process(self, something=False):
self.page.setViewportSize(self.page.mainFrame().contentsSize())
frame = self.page.currentFrame()
elements = frame.findAllElements('span[class="%s"]' % magical_class)
for e in elements:
s = e.toPlainText()
rect = e.geometry()
dim = [rect.x(), rect.y(),
rect.x() + rect.width(), rect.y() + rect.height()]
if s and rect.width() > 0 and rect.height() > 0: print dim, s
if __name__ == '__main__':
app = QApplication(sys.argv)
data = open(sys.argv[1]).read()
webpage = WebPage(data)
webpage.start()
input.html (slightly altered to show more problems with simple string dumping:
a<span>b<span>c</span></span>
<table border="1">
<tr><td>r1 <font>c1</font> </td><td>r1 c2</td></tr>
<tr><td></td><td>row2 & c2</td></tr>
</table>
more <b>contents</b>
and the test run:
$ python parser.py input.html temp.html
[ a ] [ b ] [ c ] [ r1 ] [ c1 ] [ r1 c2 ] [ row2 & c2 ] [ more ] [ contents ]
$ charpos.py temp.html
[8, 8, 17, 26] a
[17, 8, 26, 26] b
[26, 8, 34, 26] c
[13, 48, 18, 66] r
[18, 48, 27, 66] 1
[13, 67, 21, 85] c
[21, 67, 30, 85] 1
[36, 48, 41, 66] r
[41, 48, 50, 66] 1
[36, 67, 44, 85] c
[44, 67, 53, 85] 2
[36, 92, 41, 110] r
[41, 92, 50, 110] o
[50, 92, 61, 110] w
[61, 92, 70, 110] 2
[36, 111, 47, 129] &
[51, 111, 59, 129] c
[59, 111, 68, 129] 2
[8, 135, 21, 153] m
[21, 135, 30, 153] o
[30, 135, 35, 153] r
[35, 135, 44, 153] e
[8, 154, 17, 173] c
[17, 154, 27, 173] o
[27, 154, 37, 173] n
[37, 154, 42, 173] t
[42, 154, 51, 173] e
[51, 154, 61, 173] n
[61, 154, 66, 173] t
[66, 154, 75, 173] s
Looking at the bounding boxes, it is (in this simple case without changes in font size and things like subscripts) quite easy to glue them back into words if you wish.
I worked it out.
for elem in QWebView().page().currentFrame().documentElement().findAll('*'):
print unicode(elem.toPlainText()), unicode(elem.geometry().getCoords()), '\n'
It matches anything, and then iterates over what is found - thereby iterating over the DOM tree.