Notes and experiments on
the D language,
part II
By leonardo maffi, V.1.1, Jun 13 2007
Keywords: D language, Python, programming
[Go back to the article index]
Software of this article: d_notes2.zip
The D language:
http://www.digitalmars.com/d/
http://en.wikipedia.org/wiki/D_programming_language
See my first article of notes about D: d_notes.html
There is a simple problem often used in the past to compare languages and programmers, the program purpose is essentially to read all different words from a dictionary file, for each letter there was a corresponding digit like the letters on a phone. Using this mapping, read a list of phone numbers from another text file and show all possible combinations of words for that phone number. Some links to the original problem and some comparisons:
http://www.flownet.com/ron/papers/lisp-java/
http://userpages.umbc.edu/~bcorfm1/C++-vs-Lisp.html
http://www.norvig.com/java-lisp.html
Later I have found this problem has already appreared on the D newsgroup too
(and on the D site there is a version done by Lionello Lunesu, on Oct. 18th
2006):
http://www.digitalmars.com/d/archives/digitalmars/D/Lisp_vs._C_not_off-topic_42717.html
First of all I have translated this Lisp version written by Peter Norvig, encoder.lsp:
;; Peter Norvig - Programming Challange from Erann Gat:
;; http://www.flownet.com/ron/papers/lisp-java/
;; Given a list of words and a list of phone numbers, find all the ways that
;; each phone number can be expressed as a list of words.
;; Run: (main "word-list-file-name" "phone-number-file-name")
(defvar *dict* nil
"A hash table mapping a phone number (integer) to a list of words from the
input dictionary that produce that number.")
(defun main (&optional (dict "dict") (nums "nums") (dict-size 100))
"Read the input file ¨DICT and load it into *dict*. Then for each line in
NUMS, print all the translations of the number into a sequence of words,
according to the rules of translation."
(setf *dict* (load-dictionary dict dict-size))
(with-open-file (in nums)
(loop for num = (read-line in nil) while num do
(print-translations num (remove-if-not #'digit-char-p num)))))
(defun print-translations (num digits &optional (start 0) (words nil))
"Print each possible translation of NUM into a string of words. DIGITS
must be WORD with non-digits removed. On recursive calls, START is the
position in DIGITS at which to look for the next word, and WORDS is the list
of words found for (subseq DIGITS 0 START). So if START gets to the end of
DIGITS, then we have a solution in WORDS. Otherwise, for every prefix of
DIGITS, look in the dictionary for word(s) that map to the value of the
prefix (computed incrementally as N), and for each such word try to extend
the solution with a recursive call. There are two complications: (1) the
rules say that in addition to dictionary words, you can use a single
digit in the output, but not two digits in a row. Also (and this seems
silly) you can't have a digit in a place where any word could appear.
I handle this with the variable FOUND-WORD; if it is false after the loop,
and the most recent word is not a digit, try a recursive call that pushes a
digit. (2) The other complication is that the obvious way of mapping
strings to integers would map R to 2 and ER to 02, which of course is
the same integer as 2. Therefore we prepend a 1 to every number, and R
becomes 12 and ER becomes 102."
(if (>= start (length digits))
(format t "~a:~{ ~a~}~%" num (reverse words))
(let ((found-word nil)
(n 1)) ; leading zero problem
(loop for i from start below (length digits) do
(setf n (+ (* 10 n) (nth-digit digits i)))
(loop for word in (gethash n *dict*) do
(setf found-word t)
(print-translations num digits (+ 1 i) (cons word words))))
(when (and (not found-word) (not (numberp (first words))))
(print-translations num digits (+ start 1)
(cons (nth-digit digits start) words))))))
(defun load-dictionary (file size)
"Create a hashtable from the file of words (one per line). Takes a hint
for the initial hashtable size. Each key is the phone number for a word;
each value is a list of words with that phone number."
(let ((table (make-hash-table :test #'eql :size size)))
(with-open-file (in file)
(loop for word = (read-line in nil) while word do
(push word (gethash (word->number word) table))))
table))
(defun word->number (word)
"Translate a word (string) into a phone number, according to the rules."
(let ((n 1)) ; leading zero problem
(loop for i from 0 below (length word)
for ch = (char word i) do
(when (alpha-char-p ch) (setf n (+ (* 10 n) (char->digit ch)))))
n))
(defun nth-digit (digits i)
"The i-th element of a character string of digits, as an integer 0 to 9."
(- (char-code (char digits i)) #.(char-code #\0)))
(defun char->digit (ch)
"Convert a character to a digit according to the phone number rules."
(ecase (char-downcase ch)
((#\e) 0)
((#\j #\n #\q) 1)
((#\r #\w #\x) 2)
((#\d #\s #\y) 3)
((#\f #\t) 4)
((#\a #\m) 5)
((#\c #\i #\v) 6)
((#\b #\k #\u) 7)
((#\l #\o #\p) 8)
((#\g #\h #\z) 9)))
This is the uncommented Python V.2.5 version that uses Psyco too, encoder.py:
import string, collections, psyco, gc
psyco.full()
def load_dictionary(dict_filename):
gtable = string.maketrans("ejnqrwxdsyftamcivbkulopghzEJNQRWXDSYFTAMCIVBKULOPGHZ",
"0111222333445566677788899901112223334455666777888999")
gdict = collections.defaultdict(list)
for word in file(dict_filename):
gdict[word.translate(gtable, '"\n')].append(word[:-1])
return gdict
def print_translations(num, digits, start, words):
if start >= len(digits):
print "%s:" % num, " ".join(words)
else:
found_word = False
for i in xrange(start, len(digits)):
if digits[start:i+1] in gdict:
found_word = True
for hit in gdict[digits[start:i+1]]:
print_translations(num, digits, i+1, words + [hit])
if not found_word and (not words or (words and not words[-1].isdigit())):
print_translations(num, digits, start+1, words + [digits[start]])
gc.disable()
gdict = load_dictionary("dictionary.txt")
for num in file("input.txt"):
print_translations(num[:-1], filter(str.isdigit, num), 0, [])
You can try it with the two input files of the data directory in the zip of
the software of this article, the output is meant as correct if the lines are
in a different order too. That Psyco program needs about 1.64 seconds to run
on a PIII @ 500 MHz with Python V.2.5.1 and Psyco 1.5.2 on Windows (the first
exact translation of the Lisp program required about 11 seconds to run without
Psyco, I have changed many details to make it more pythonic, and I have used
strings instead of integers). As you can see the code is rather short and it
runs very quickly (I have disabled the garbage collector to make it even faster).
Then I have tried to improve its speed, the resulting program is longer, encoder_fast.py:
import string, psyco
psyco.full()
def load_dictionary(dict_filename):
gtable = string.maketrans("ejnqrwxdsyftamcivbkulopghzEJNQRWXDSYFTAMCIVBKULOPGHZ",
"0111222333445566677788899901112223334455666777888999")
gdict = {}
for word in file(dict_filename):
k = word.translate(gtable, '"\n')
if k in gdict:
if type(gdict[k]) == list:
gdict[k].append(word[:-1])
else:
gdict[k] = [gdict[k], word[:-1]]
else:
gdict[k] = word[:-1]
return gdict
def print_translations(num, digits, start, words):
if start >= len(digits):
print "%s:" % num, " ".join(words)
else:
found_word = False
for i in xrange(start, len(digits)):
n = digits[start:i+1]
if n in gdict:
found_word = True
hits = gdict[n]
if type(hits) == list:
for hit in hits:
print_translations(num, digits, i+1, words + [hit])
else:
print_translations(num, digits, i+1, words + [hits])
if not found_word and (not words or (words and not words[-1].isdigit())):
print_translations(num, digits, start+1, words + [digits[start]])
gdict = load_dictionary("dictionary.txt")
for num in file("input.txt"):
print_translations(num[:-1], filter(str.isdigit, num), 0, [])
It works faster becase most words can be associated to a single number, there are just few degenerate cases, that can be managed with a list of words, all the others can be associated with a single string as value of the gdict dictionary. It runs in about 1.03 s on the given data files. That little Python program requires lot of code if you want to translate it to C++, even if you use the STL.
So to improve my so far little D skills, I have translated the first Python program to D, encoder.d:
// Compile with: dmd -O -release encoder.d
import std.stdio, std.stream, std.string, std.ctype, std.gc;
void traduct(char[] n, char[] digits, int start, char[][] words, char[][][char[]] gdict) {
if (start >= digits.length)
writefln(n, ": ", words.join(" "));
else {
auto found_word = false;
for(auto i = start; i < digits.length; i++)
if (digits[start .. i+1] in gdict) {
found_word = true;
foreach(hit; gdict[digits[start .. i+1]])
traduct(n, digits, i+1, words ~ [hit], gdict);
}
if (!found_word && (!words || (words && !std.ctype.isdigit(words[words.length-1][0]))))
traduct(n, digits, start+1, words ~ [digits[start..start+1]], gdict);
}
}
void main() {
std.gc.disable(); // to speed up the program a bit
auto gtable = maketrans("ejnqrwxdsyftamcivbkulopghzEJNQRWXDSYFTAMCIVBKULOPGHZ",
"0111222333445566677788899901112223334455666777888999");
char[][][char[]] gdict;
foreach(char[] w; new BufferedFile("dictionary.txt"))
gdict[w.translate(gtable, "\"")] ~= w.dup;
foreach(char[] n; new BufferedFile("input.txt"))
traduct(n, n.removechars("/-"), 0, [], gdict);
}
That little program is the the last version, the first version was slower and longer. It runs in about 0.72 s. As you can see if you work with strings D is just about as expressive as Python, that's quite a feat. The string library of D is really copied from the string methods of Python, the names are the same and even the order of the parameters is often the same (for example see the maketrans). But the D std.string contains functions and not methods (like the old Python string module), and it contains more functions, some of them are quite useful, like removechars and others that use patterns.
I have tried to speed up the D code using a trick similar to the one I have used in Python, but all my tests have failed to improve the speed of this version (more on this later), it seems the D dynamic arrays are so light that it's not necessary to use other things (my best of such alternative version attempts has an associative array whose values are structs of a boolean plus a union, the union is done among a string and a list of string, the boolean is true if the union contains a list of strings. See files load_dict1.d and load_dict2.d).
I think the D language has some deficiencies and faults:
- It is quite young still, with rough corners here and there, some parts aren't
ripe yet or are implemented in a slow way.
- There aren't books about D, a good book can be quite important to learn well
a language. There is some online documentation on the site, but I think it's
not much organized.
- It is changing still some from one version to the next one (but we have just
passed the version 1.0, we are at the 1.015 when I am writing this).
- It is developed mostly by one an (incredible) person (so it can't grow too
much quickly).
- It's not widespread, for each D programmer there are probably 100 or 1000
Python programmers (and probably 10000 or more Java programmers). So it's not
easy to find routines and modules already written to perform many tasks. So
it's not easy to find a job where you are asked to use D.
- I don't know if it will be successiful, it may even vanish in few years (but
not too much fast, it's already about 8 years old, and lately its usage has
quickly become more widespread).
- It's not a revolutionary language, it's just an improvement, it's essentially
a C++ done right, it doesn't have true new things. While around you can find
some languages that contain new ideas, like Erlang, Oz, Ocaml, F#, Haskell,
etc.
- There are just 1-2 compliers for D, and its portability to other OS is good
but not perfect (there were problem on Mac, it seems).
- A Python programmer can find that programs that mange strings with the normal
string functions are about as fast as the same programs done with Python + Psyco.
But I think it has some advantages too:
- Well written D code (with the DMD compiler) can be on average just 20% slower
than C code compiled with GCC, so for my purposes it's fast enough. It's much
faster than Psyco for numerical code, and it's about as fast as Psyco for hi-level
string processing code, so I can use D to write computationally heavy code too
(that I often can't write with Psyco).
- It is similar to C++, but many things are simplified, so I can probably learn
and use most of it.
- For programs that perform some high-level string processing the code is very
similar to Python code, or it may be even better regarding few small things
(but D isn't a dynamic language, you can't program interactively from the shell,
you can't easely create multy-type data structures, etc (even if you can use
the Box from the D standard lib, that is a kind of variant with a less good
syntax. I think Boo language has better Variants, I think it's called the Duck
type).
- It allows me to write many kinds of programs that I can't create with Python,
like the ones that accurately manage the memory, type conversions, single bits
quick management (but D structs lack C bitfields! It's not good), creation and
management of true pointer structures, char/byte management, etc. And the speed
of the D compiled code may allow me to create fast graphics programs (I am looking
for a 2D graphics lib for D still), data compression, etc. ShedSkin isn't enough
for them.
- With Pyd I can use D functions/classes from Python. It may be useful. Often
Python is more convenient still, D can't never be a full replacement of Python.
- D has really a lot of features and useful "tricks", I am learning
about them still. It is able to do most things you may want to do with a statically
compiled language. It has quite powerful templates (more powerful and simpler
than C++ ones), and strong introspective capabilities, both static (compile
time) and run-time ones too.
- It seems D allows both low-level programming (abous as low as C, expecially
if you disable the GC, etc), and medium-level programming too (classes, metaprogramming,
introspection of the types, automatic memory management, high-level string management,
etc). Today I belive this dual nature is very important for me in a language,
ShedSkin has shown me that it's very useful (and possible) for a language being
able to be used in both situations.
On the editor I usually use a different font for each different language, this helps me tell apart lenguages better, avoiding confusing their syntax. At the moment, coming from Python, I'd like to have (or to implement myself) some hi-level things of Python, like zip, izip, map, imap, enumerate, ienumerate, list(), range, xrange, ecc.
So in the parts of the code where I don't need max speed I can this:
foreach(i; range(start, digits.length)) {
instead of:
for(auto i = start; i < digits.length; i++) {
That is shorter, more readable and it reduces the probability to put bugs (because the syntax is simpler, and you are writing the name of the variabile just 1 time). You have to remember that often 90% of the lines of a program don't need to be very fast (or very memory conscious), so for those lines you can use a simpler syntax that you can write faster with less bugs. For the other 10% of the lines the language can give you lower level syntax that allows you the max speed and max flexibility (like the usual C "for" syntax).
Regarding the D program, using a "translator" that manages significan whitespace as Python, you may write the program like this (with : without ; and without { }). I think there are many advantages in writing the code like this, the code is more compact, more readable (by me) compatto, and it avoids some bugs, like adding a second istruction inside a block while the {} are missing, bugs in the complex parts with many nested if/then/else, bugs where the indentation that means something while the brackets mean something else, etc. This is the code, encoder_whitespacething.~d:
// 22 lines instead of 26
import std.stdio, std.stream, std.string, std.ctype, std.gc
void traduct(char[] n, char[] digits, int start, char[][] words, char[][][char[]] gdict):
if (start >= digits.length):
writefln(n, ": ", words.join(" "))
else:
auto found_word = false
foreach(i; range(start, digits.length)):
if (digits[start .. i+1] in gdict):
found_word = true
foreach(hit; gdict[digits[start .. i+1]]):
traduct(n, digits, i+1, words ~ [hit], gdict)
if (!found_word && (!words || (words && !std.ctype.isdigit(words[words.length-1][0])))):
traduct(n, digits, start+1, words ~ [digits[start..start+1]], gdict)
void main():
std.gc.disable()
auto gtable = maketrans("ejnqrwxdsyftamcivbkulopghzEJNQRWXDSYFTAMCIVBKULOPGHZ",
"0111222333445566677788899901112223334455666777888999")
char[][][char[]] gdict
foreach(char[] w; new BufferedFile("dictionary.txt")):
gdict[w.translate(gtable, "\"")] ~= w.dup
foreach(char[] n; new BufferedFile("input.txt")):
traduct(n, n.removechars("/-"), 0, [], gdict)
Anyway, the D autor says he wants to keep D much similar to C to ease the migration of programmers from C/C++ to D, so he will quite probably not accept to use the significative whitespace (on the other hand it can be created a (simple?) translator that translated code with significan whitespace to normal D code with brackets, and then compiles/runs it).
In the end I have found a way to speed up the D program, this version loads the whole dictionary in memory then iterates on single chars, finding the words. Note that this line:
gdict[word.translate(gtable, "\"")] ~= word;
Doesn't put a string as a value into the associative array, because D manages string copies as just references to the original string. This reduces memory used and speeds up the program. Note that Psyco is able to do similar things, but Psyco is also able to fully hide this semantics from the programmers, while in D you sometimes have to add a .dup to actually copy a string and avoid nasty bugs. Here is the faster version of the D code, encoder_quick.d:
import std.stdio, std.stream, std.string, std.ctype, std.gc;
void traduct(char[] n, char[] digits, int start, char[][] words, char[][][char[]] gdict) {
if (start >= digits.length)
writefln(n, ": ", words.join(" "));
else {
auto found_word = false;
for(auto i = start; i < digits.length; i++)
if (digits[start .. i+1] in gdict) {
found_word = true;
foreach(hit; gdict[digits[start .. i+1]])
traduct(n, digits, i+1, words ~ [hit], gdict);
}
if (!found_word && (!words || (words && !std.ctype.isdigit(words[words.length-1][0]))))
traduct(n, digits, start+1, words ~ [digits[start..start+1]], gdict);
}
}
void main() {
std.gc.disable();
auto gtable = maketrans("ejnqrwxdsyftamcivbkulopghzEJNQRWXDSYFTAMCIVBKULOPGHZ",
"0111222333445566677788899901112223334455666777888999");
size_t line_start;
char[][][char[]] gdict;
auto input_dict = cast(char[])std.file.read("dictionary.txt");
foreach (current_pos, c; input_dict)
if (c == '\n') { // words with DOS newlines too
auto word = input_dict[line_start .. current_pos].strip();
// word isn't a string, it's just a reference (start-end index) to
// the input_dict string, despite being stripped.
gdict[word.translate(gtable, "\"")] ~= word;
line_start = current_pos+1;
}
auto word = input_dict[line_start .. input_dict.length].strip();
if (word.length > 0)
gdict[word.translate(gtable, "\"")] ~= word;
foreach(char[] n; new BufferedFile("input.txt"))
traduct(n, n.removechars("/-"), 0, [], gdict);
}
Summary of the D and Python programs (Python programs on Python 2.5.1, plus Psyco 1.5.2):
|
Program
name
|
Running
time
seconds |
Memory
used
MB |
| encoder.py |
1.64
|
16.9
|
| encoder_fast.py |
1.03
|
12.5
|
| encoder.d |
0.72
|
7.4
|
| encoder_quick.d |
0.61
|
7.1
|
Anyway, I think not Python nor D are fit for the 80-core CPU announced by Intel in the next 4-7 years (even if PyPy can improve the situation with the Python GIL and future array operations of D can improve vector programming on a parallel CPUs). For CPUs a different language may be necessary, and maybe new programmers too ;-)
Regarding the ability to sort items in a modern programming language:
- About 90-95% of the times you have to sort in a quite standard way, but the
language has to give you a way to sort in the other 5-10% of the more complex
situations too.
- About 90% of the times you have to sort small groups of data, and you don't
need to sort it in a very fast way. But the language has to allow you to solve
the remaining 10% situations too, that is it has to offer you a way to sort
a lot of data too and in situations where you want to speed up the sort as much
as possible.
- It's important to write programs as short as possible, Python shows that defaults
help a lot. And a program has to look as simple as possible to speed up the
programming and avoid bugs, so having a simple syntax is quite important.
So there are 4 sorting situations:
1) You have to sort in a simple way, and you don't need a fast sort (and you
don't need a sort that uses as little memory as possible, because you have little
data).
2) You have to sort in a complex way, but you don't need a fast sort or a sort
that uses as little memory as possible.
3) You have to sort in a simple way, but you need a fast sort (or/and you need
a sort that uses as little memory as possible).
4) You have to sort in a complex way, and you need a fast sort or a sort that
uses as little memory as possible.
Some languages seem designed to manage only few of such cases. Python is good to manage cases 1 and 2, you can't reach the max sorting speed of a compiled language. C language is fit for case 4 (the syntax isn't short, so in situation 1 you end writing too much code for a simple and standard situation too). ShedSkin is something in the middle, it is fast too, but there aren't ways to simplify medium-complexity situations (the key and cmp parameters of the sort aren't supported yet). D language may become fit for all situations, but I don't see a way to manage situation 2 in a good way yet (maybe I have to learn more D or maybe D has to improve still).
Here is a more detailed Python-D comparison regarding the various kinds of
sort:
Situation 1:
Python:
print sorted([2, 6, 4, 9])
D:
writefln([2, 6, 4, 9].sort);
Situation 2 and 4:
You can see below, in D the sort in a complex way becomes quite long and convoluted,
so D is fit for situation 4 but not situation 2. Python is very good in situation
2, but not very good in situation 4.
Situation 3:
Sorting in D is quite fast, and in simple situations you can sort using a very
simple syntax, so D is fit for the case 3 too. Python isn't much good in this
situation.
To test the situation 2 I have written some Python code followed by D code that
does the same.
Disclaimer: regarding the D language I am a newbie still, so there may be better
ways to write that D code. This is the Python version, the code shows four ways
to sort the key,value pairs of a very little dictionary (associative array with
D), dict_sort_test.py:
from operator import itemgetter
d = {2:"a", 3:"b", 1:"c", 4:"b"}
print "k,v of d ordered according to the keys:"
print sorted(d.iteritems(), key=itemgetter(0))
# Output: [(1, 'c'), (2, 'a'), (3, 'b'), (4, 'b')]
print "k,v of d ordered according to the values:"
print sorted(d.iteritems(), key=itemgetter(1))
# Output: [(2, 'a'), (3, 'b'), (4, 'b'), (1, 'c')]
print "k,v of d ordered according to (k,v):"
print sorted(d.iteritems())
# Output: [(1, 'c'), (2, 'a'), (3, 'b'), (4, 'b')]
print "k,v of d ordered according to (v,k):"
print sorted(d.iteritems(), key=itemgetter(1, 0))
# Output: [(2, 'a'), (3, 'b'), (4, 'b'), (1, 'c')]
The (very long) D version, dict_sort_test.d:
import std.stdio;
void main() {
char[][int] d = [2:"a", 3:"b", 1:"c", 4:"b"];
writefln("k,v of d ordered according to the keys:");
foreach(k; d.keys.sort)
writef(k, " ", d[k], " ");
writefln();
// Output: 1 c 2 a 3 b 4 b
// -----------------------------------------------
writefln("k,v of d ordered according to the values:");
// Does it exist a shorter way to do this!?
struct Pair1 {
int k;
char[] v;
int opCmp(Pair1 other) {
if (v == other.v)
return 0;
if (v < other.v)
return -1;
else
return 1;
}
}
Pair1[] pairs1;
foreach(k, v; d)
pairs1 ~= Pair1(k, v);
/*
// Longer but faster alternative:
pairs.length = d.length;
size_t i;
foreach(k, v; d) {
pairs[i] = Pair1(k, v);
++i;
}
*/
// writef isn't able to print structs
foreach(p; pairs1.sort)
writef(p.k, " ", p.v, " ");
writefln();
// Output: 2 a 3 b 4 b 1 c
// -----------------------------------------------
writefln("k,v of d ordered according to (k,v):");
struct Pair2 {
int k;
char[] v;
int opCmp(Pair2 other) {
if (k == other.k) {
if (v == other.v)
return 0;
if (v < other.v)
return -1;
else
return 1;
}
if (k < other.k)
return -1;
else
return 1;
}
}
Pair2[] pairs2;
foreach(k, v; d)
pairs2 ~= Pair2(k, v);
foreach(p; pairs2.sort)
writef(p.k, " ", p.v, " ");
writefln();
// Output: 1 c 2 a 3 b 4 b
// -----------------------------------------------
writefln("k,v of d ordered according to (v,k):");
struct Pair3 {
int k;
char[] v;
int opCmp(Pair3 other) {
if (v == other.v) {
if (k == other.k)
return 0;
if (k < other.k)
return -1;
else
return 1;
}
if (v < other.v)
return -1;
else
return 1;
}
}
Pair3[] pairs3;
foreach(k, v; d)
pairs3 ~= Pair3(k, v);
foreach(p; pairs3.sort)
writef(p.k, " ", p.v, " ");
writefln();
// Output: 2 a 3 b 4 b 1 c
}
As you can see D requires about 10 times lines of code to sort such tiny dictionary
in a way a bit more complex than usual (but maybe a person that knows D more
than me can find a way to write less code).
Conclusion: ShedSkin and D can be both become fit to manage all four situations.
They can manage the key and cmp parameters of the sort too, to be fit for the
common situation 2 too. (For the situation 4 they are already fit, you can use
a struct/class with a comparing function, etc, when sorting speed is essential
you can tolerate to write more code).
Software of this article: d_notes2.zip