By leonardo maffi
V.1.0, May 13 2009
keywords: programming, D language, Python, Psyco, C language, benchmarks
[Go back to the article index]
Once in a while in my programs I need to keep an ordered sequence of items, process them all iteratively, and during each iteration remove few of them that aren't good anymore. So in each iteration only small number of items is removed, and such removals happen in random positions.
To focus on a basic version of such problem, the items can be random positive integers, the operation can decrease them by 1, and they can be removed when they become 0. The iterations stop when there are no items left in the sequence.
Here you can find all the code shown below (the zip may contain code more up to date compared to the code shown in this page, so use the zip as the reference code):
http://www.fantascienza.net/leonard
If you don't need to keep the order of the items then a good solution is just to keep them in a fixed-size array, and replace each removed one with good items taken from the end of the array (you need to keep the used length of the array).
In Python if you need to keep the items ordered you can keep them into a Python list (that is an a dynamic array), and in each iteration create a new array of the good items:
# Python program #1
from random import randrange
from timeit import default_timer as clock
def generate(nitems, k):
return [randrange(k) for i in xrange(nitems)]
def map_filter(items):
return [x-1 for x in items if x > 1]
def main(nitems, k):
items = generate(nitems, k)
t = clock()
while items:
items = map_filter(items)
if nitems <= 300:
print items, "\n"
print round(clock() - t, 2)
import psyco; psyco.full()
main(400000, 400)
# unused Python function
def map_filter(items):
i = 0
while i < len(items):
if items[i] > 0:
items[i] -= 1
i += 1
else:
del items[i:i+1]
# from Python program #2
def map_filter(items):
nzeros = 0
for i in xrange(len(items)):
if items[i] > 0:
items[i] -= 1
else:
nzeros += 1
if nzeros > 0:
if (float(nzeros) / len(items)) > 0.1:
items = filter(None, items)
return items
// program D #1
import std.c.stdio: printf;
// dlibs: http://www.fantascienza.net/leonardo/so/libs_d.zip
import d.random: fastRandInt;
import d.time: clock;
import d.builders: ArrayBuilder;
int[] generate(int nitems, int k) {
auto result = new int[nitems];
for (int i; i < nitems; i++)
result[i] = fastRandInt(k-1);
return result;
}
bool map_filter(ref int[] items) {
ArrayBuilder!(int) temp;
foreach (item; items)
if (item > 1)
temp ~= item - 1;
items = temp.toarray;
return items.length != 0;
}
void main() {
auto data = generate(400_000, 400);
auto t = clock();
while (map_filter(data)) {}
printf("%f\n", clock() - t);
}
// program D #2
import std.c.stdio: printf;
import d.random: randInt;
import d.time: clock;
struct Node {
int data;
Node* next;
}
Node* generate(int nnodes, int k) {
auto nodes = new Node[nnodes];
for (int i = 0; i < nnodes - 1; i++)
nodes[i] = Node(randInt(k-1), &(nodes[i+1]));
nodes[$-1] = Node(randInt(k-1), null);
return nodes.ptr;
}
void show_list(Node* nodes) {
if (nodes !is null)
for (; nodes; nodes = nodes.next)
printf("%d ", nodes.data);
printf("\n");
}
Node* map_filter(Node* nodes) {
Node* result;
while (nodes != null) {
if (nodes.data > 0) {
nodes.data--;
Node* temp = nodes.next;
nodes.next = result;
result = nodes;
nodes = temp;
} else {
nodes = nodes.next;
}
}
return result;
}
void main() {
auto data = generate(400_000, 400);
//show_list(data);
auto t = clock();
do {
data = map_filter(data);
//show_list(data);
} while (data != null);
printf("%f\n", clock() - t);
}
// program D #3
import d.random: fastRandInt;
import d.time: clock;
import d.builders: ArrayBuilder;
import std.stdio: writefln;
int[] generate(int nitems, int k) {
auto result = new int[nitems];
for (int i; i < nitems; i++)
result[i] = fastRandInt(k-1);
return result;
}
bool map_filter(ref int[] items) {
static bool ignoring_update(int[] items) {
bool modified = false;
int i = 0;
for (; i < items.length; i++)
if (items[i] > 0) {
modified = true;
items[i]--;
break;
}
for (; i < items.length; i++)
if (items[i] > 0)
items[i]--;
return modified;
}
static bool compacting_update(ref int[] items) {
ArrayBuilder!(int) temp;
foreach (item; items)
if (item > 1)
temp ~= item - 1;
items = temp.toarray;
return items.length != 0;
}
if (items.length < 300)
return ignoring_update(items);
// compute approximate stats
int nprobes = items.length / 20;
nprobes = nprobes > 5000 ? 5000 : nprobes;
int nempty;
for (int i; i < nprobes; i++)
if (items[i] <= 0)
nempty++;
if ((cast(double)nempty / cast(double)nprobes) < 0.07)
// less than 7% are bad, found experimentally. This is a
// very small percentage. Maybe a higher percentage breaks
// too much the future instructions done by the cpu
return ignoring_update(items);
else
return compacting_update(items);
}
/// compute full stats
int stats(int[] items) {
int nempty;
foreach (el; items)
if (el <= 0)
nempty++;
return nempty;
}
void main() {
auto data = generate(400_000, 400);
auto t = clock();
while (map_filter(data)) {
//writefln(data, "\n");
//int empty = stats(data);
//writefln("len, empty: ", data.length, " ", empty);
}
writefln(clock() - t);
}
// program D #4
import std.stdio: writefln;
// dlibs: http://www.fantascienza.net/leonardo/so/libs_d.zip
import d.random: fastRandInt;
import d.time: clock;
int[] generate(int nitems, int k) {
auto result = new int[nitems];
for (int i; i < nitems; i++)
result[i] = fastRandInt(k-1);
return result;
}
bool map_filter(ref int[] items, ref int[] aux) {
aux.length = items.length;
int pos;
foreach (item; items)
if (item > 1)
aux[pos++] = item - 1;
aux.length = pos;
// swap
int[] temp = items;
items = aux;
aux = temp;
return pos != 0;
}
void main() {
const int n = 400_000, k = 400;
auto data = generate(n, k);
int[] aux;
auto t = clock();
while (map_filter(data, aux))
if (n <= 400)
writefln(data, "\n");
writefln(clock() - t);
}
# Python program #3
from random import randrange
from timeit import default_timer as clock
def generate(nitems, k):
return [randrange(k) for i in xrange(nitems)]
def map_filter(items, len_items, aux):
pos = 0
for i in xrange(len_items):
if items[i] > 1:
aux[pos] = items[i] - 1
pos += 1
items, aux = aux, items
return items, pos, aux
def main(nitems, k):
items = generate(nitems, k)
aux = [None] * len(items)
len_items = nitems
if nitems <= 300: print items, "\n"
t = clock()
while len_items:
items, len_items, aux = map_filter(items, len_items, aux)
if nitems <= 300: print len_items, items, "\n"
print round(clock() - t, 2)
import psyco; psyco.full()
main(400000, 400)
// program D #5
import std.stdio: writefln;
// dlibs: http://www.fantascienza.net/leonardo/so/libs_d.zip
import d.random: fastRandInt;
import d.time: clock;
int[] generate(int nitems, int k) {
auto result = new int[nitems];
for (int i; i < nitems; i++)
result[i] = fastRandInt(k-1);
return result;
}
bool map_filter(ref int[] items, ref int[] aux) {
static bool ignoring_update(int[] items) {
bool modified = false;
int i = 0;
for (; i < items.length; i++)
if (items[i] > 0) {
modified = true;
items[i]--;
break;
}
for (; i < items.length; i++)
if (items[i] > 0)
items[i]--;
return modified;
}
if (items.length < 200)
return ignoring_update(items);
// compute approximate stats
int nprobes = items.length / 20;
nprobes = nprobes > 5000 ? 5000 : nprobes;
int nempty;
for (int i; i < nprobes; i++)
if (items[i] <= 0)
nempty++;
if ((cast(double)nempty / cast(double)nprobes) < 0.04) {
// 4% of the items are zero, or less
return ignoring_update(items);
} else {
// compacting update
aux.length = items.length;
int pos;
foreach (item; items)
if (item > 1)
aux[pos++] = item - 1;
aux.length = pos;
// swap
int[] temp = items;
items = aux;
aux = temp;
return pos != 0;
}
}
void main() {
const int n = 400_000, k = 400;
auto data = generate(n, k);
if (n <= 400) writefln(data, "\n");
int[] aux;
auto t = clock();
while (map_filter(data, aux))
if (n <= 400) writefln(data, "\n");
writefln(clock() - t);
}
// program D #6
import std.c.stdio: printf;
import d.random: randInt;
import d.time: clock;
struct Node {
int data;
Node* next;
}
Node* generate(int nnodes, int k) {
auto nodes = new Node[nnodes];
for (int i = 0; i < nnodes - 1; i++)
nodes[i] = Node(randInt(k-1), &(nodes[i+1]));
nodes[$-1] = Node(randInt(k-1), null);
return nodes.ptr;
}
void show_list(Node* nodes) {
if (nodes !is null)
for (; nodes; nodes = nodes.next)
printf("%d ", nodes.data);
printf("\n\n");
}
Node* map_filter(Node* nodes) {
// skip the first empty items
while (nodes != null && nodes.data < 2)
nodes = nodes.next;
if (nodes != null) {
nodes.data--; // process the current item
Node* cursor = nodes; // one node before the current item
while (cursor.next != null) {
if (cursor.next.data < 2)
cursor.next = cursor.next.next; // remove item
else {
cursor.next.data--; // process item
cursor = cursor.next; // step forward
}
}
}
return nodes;
}
void main() {
auto data = generate(400_000, 400);
//show_list(data);
auto t = clock();
do {
data = map_filter(data);
//show_list(data);
} while (data != null);
printf("%f\n", clock() - t);
}
Timings nitems=400_000, k=400, seconds, best of 3:
list_deletions3_py: 18.73 (no Psyco)
list_deletions2_py: 15.93 (no Psyco)
list_deletions1_py: 10.43 (no Psyco) (57.9 X)
list_deletions1_py: 5.37 (30 X)
list_deletions3_py: 2.36
list_deletions2_py: 2.30 (12.7 X)
list_deletions6_d: 0.75 (0.86 total)
list_deletions1_d: 0.73
list_deletions2_d: 0.60
list_deletions1_c: 0.59
list_deletions4_d: 0.26
list_deletions3_d: 0.22 (0.43 total)
list_deletions5_d: 0.18 (0.25 total) ( 1 X)
list_deletions2_c: 0.17 (0.24 total)
Timings nitems=4_000_000, k=1500, seconds, best of 3:
list_deletions3_d: 8.36 (8.58 total)
list_deletions2_c: 7.80 (7.95 total)
list_deletions5_d: 7.79 (8.08 total)
CPU: Intel Core 2, 2 GHz (2 cores), WinXP SP2.
D code compiled with:
DMD v1.042 and D 2.029
dmd -O -release -inline
Python 2.6.2 (r262:71605, Apr 14 2009, 22:40:02)
[MSC v.1500 32 bit (Intel)] on win32
Psyco for Python 2.6, V.1.6.0 final 0