""" bitarray.py -- V.1.4, Dec 21 2006, by leonardo maffi. Bitarray class. The original code was from Zope Corporation and Contributors: http://svn.zope.org/*checkout*/Zope3/trunk/src/zope/textindex/ricecode.py?content-type=text%2Fplain&rev=8532 But I have improved and modified it so much that now it's new code. TODO: Possibile improvements: - __str__ can be made faster - Methods: copy, extend - Overloaded operators: xor, and, etc. - add Frozenbitarray class too? -------------------------------------------------- Some Doctests: Simple first test, creation, assign, repr, str, iteration: >>> ba = Bitarray(nbits=10) >>> ba Bitarray(nbits=10, buf='\\x00\\x00') >>> for bit in ba: print bit, 0 0 0 0 0 0 0 0 0 0 >>> print ba 0000000000 First general test, assign, read, str, repr: >>> b = Bitarray(nbits=50) >>> print b 00000000000000000000000000000000000000000000000000 >>> b Bitarray(nbits=50, buf='\\x00\\x00\\x00\\x00\\x00\\x00\\x00') >>> b[0] = 1 >>> print b 10000000000000000000000000000000000000000000000000 >>> b Bitarray(nbits=50, buf='\\x01\\x00\\x00\\x00\\x00\\x00\\x00') >>> b[2] = 1 >>> print b 10100000000000000000000000000000000000000000000000 >>> b[10] = 1 >>> print b[10] 1 >>> print b 10100000001000000000000000000000000000000000000000 >>> b[10] = 0 >>> print b 10100000000000000000000000000000000000000000000000 >>> print b[10] 0 >>> len(b) 50 Test set, reset: >>> b.set() >>> b Bitarray(nbits=50, buf='\\xff\\xff\\xff\\xff\\xff\\xff\\xff') >>> b.reset() >>> b Bitarray(nbits=50, buf='\\x00\\x00\\x00\\x00\\x00\\x00\\x00') Test iteration, __getstate__, __setstate__: >>> b1 = Bitarray(buf="foo") >>> [bit for bit in b1] [0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0] >>> state = b1.__getstate__() >>> state (24, 'foo') >>> b2 = Bitarray() >>> b2.__setstate__(state) >>> b1 == b2 True Test append, len, coherent state, toint: >>> b = Bitarray(nbits=50) >>> b[0] = 1 >>> b[2] = 1 >>> print b 10100000000000000000000000000000000000000000000000 >>> b.toint() 5 >>> b.nbits 50 >>> len(b.bytes) 7 >>> b.append(1) >>> b.append(0) >>> b.append(1) >>> print b 10100000000000000000000000000000000000000000000000101 >>> b.nbits 53 >>> len(b.bytes) 7 >>> b.toint() 5629499534213125L Out of bound assign test: >>> b[100] = 1 Traceback (most recent call last): ... IndexError: Bitarray index out of range >>> b[-1] = 1 Traceback (most recent call last): ... IndexError: Bitarray index out of range Out of bound read test: >>> b[100] Traceback (most recent call last): ... IndexError: Bitarray index out of range >>> b[-1] Traceback (most recent call last): ... IndexError: Bitarray index out of range Repr + eval test: >>> b = Bitarray(buf="hello"*10) >>> eval(repr(b)) == b True Test that requires lot of memory (and some time): >>> b = Bitarray(nbits = 30*1000*1000*8) >>> print len(b) 240000000 """ from array import array from math import ceil from itertools import islice, imap, repeat, chain, izip # bit_conv is used to convert the bits given to the Bitarray.frombits from string import whitespace bit_conv = dict.fromkeys(whitespace) for bit in [0, "0", False]: bit_conv[bit] = 0 for bit in [1, "1", True]: bit_conv[bit] = 1 del bit # _bytes2ibits is used by Bitarray.__iter__ to convert bytes to (reversed) 0/1 digits _nibbles = {"0":"0000", "1":"0001", "2":"0010", "3":"0011", "4":"0100", "5":"0101", "6":"0110", "7":"0111", "8":"1000", "9":"1001", "A":"1010", "B":"1011", "C":"1100", "D":"1101", "E":"1110", "F":"1111"} _bytes2bits = ["".join(_nibbles[nibble] for nibble in "%02X" % n)[::-1] for n in xrange(256)] # _setbits is the number of 1 bits in the binary representation of a number. # It's used by Bitarray.hamming _setbits = [bits.count("1") for bits in _bytes2bits] _bytes2ibits = [map(int, byte) for byte in _bytes2bits] del _bytes2bits class Bitarray(object): """Bitarray(nbits=None, buf=None): class to manage a dynamic array of bits. If 'buf'=None and 'nbits'!=None, it creates an empty Bitbuffer of nbits bits. If 'buf' is a string or sequence of unsigned bytes and nbits=None, it uses all their bits to initialize the Bitbuffer. If 'buf' is defined, and nbits!=None, it creates a buffer of 'nbits' bits, and fills it with the given data, appending zeros or taking only part of 'buf' as needed.""" # Note that the less significant bits of each byte are used first by the Bitarray def __init__(self, nbits=None, buf=None): """ __init__ test 1: >>> b = Bitarray(0) >>> assert len(b) == 0 >>> assert str(b) == "" __init__ test 2: >>> b = Bitarray() >>> assert len(b) == 0 >>> assert str(b) == "" __init__ test 3: >>> b = Bitarray(10) >>> assert len(b) == 10 >>> print b 0000000000 __init__ test 4: >>> b = Bitarray(buf="hello") >>> assert len(b) == 40 >>> print b 0001011010100110001101100011011011110110 >>> b.bytes array('B', [104, 101, 108, 108, 111]) >>> b Bitarray(nbits=40, buf='hello') __init__ test 5: >>> b = Bitarray(nbits=10, buf="hello") >>> assert len(b) == 10 >>> print b 0001011010 >>> b Bitarray(nbits=10, buf='he') __init__ test 6: >>> b = Bitarray(nbits=50, buf="hello") >>> assert len(b) == 50 >>> print b 00010110101001100011011000110110111101100000000000 >>> b Bitarray(nbits=50, buf='hello\\x00\\x00') __init__ test 7: >>> b = Bitarray(nbits=40, buf="hello") >>> assert len(b) == 40 >>> print b 0001011010100110001101100011011011110110 >>> b Bitarray(nbits=40, buf='hello') __init__ test 8: >>> b = Bitarray(nbits=-10, buf="hello") Traceback (most recent call last): ... ValueError: Bitarray: nbits can't be < 0 __init__ test 9: >>> b = Bitarray(nbits=-10) Traceback (most recent call last): ... ValueError: Bitarray: nbits can't be < 0 __init__ test 10: >>> b = Bitarray(nbits=0, buf=iter(range(5)) ) >>> b Bitarray(nbits=0, buf='') __init__ test 11: >>> b = Bitarray(nbits=20, buf=iter(range(5)) ) >>> b Bitarray(nbits=20, buf='\\x00\\x01\\x02') __init__ test 12: >>> b = Bitarray(nbits=40, buf=iter(range(5)) ) >>> b Bitarray(nbits=40, buf='\\x00\\x01\\x02\\x03\\x04') __init__ test 12: >>> b = Bitarray(nbits=7*8, buf=iter(range(5)) ) >>> b Bitarray(nbits=56, buf='\\x00\\x01\\x02\\x03\\x04\\x00\\x00') """ if nbits is None: if buf is None: # First case, creates an empty Bitarray. self.bytes = array('B') self.nbits = 0 else: # Second case, creates a Bitarray from the input string/sequence, very fast # Input sequence can be an array of unsigned short ints too. self.bytes = array('B', buf) self.nbits = len(self.bytes) * 8 else: if nbits < 0: raise ValueError("Bitarray: nbits can't be < 0") self.nbits = nbits # nbytes is a number of bytes enough to contain all the requested nbits. nbytes = int(ceil(nbits / 8.0)) if buf is None: # This is the faster way to initialize it. self.bytes = array('B', [0]) * nbytes else: # If nbits is specified if isinstance(buf, str): # Manages the case with an input string because it's common and # it can be done quickly. len_buf = len(buf) if len_buf == nbytes: # The given string has a len compabible with the requires nbits, # so no conversion is needed. self.bytes = array('B', buf) elif len_buf > nbytes: # The given string is longer than the required number of bits, # so we take only part of the string. # This slicing may require a lot of memory, but it's fast. self.bytes = array('B', buf[:nbytes]) # Slow, but needs little memory: # self.bytes = array('B', imap(ord, islice(buf, 0, nbytes)) ) else: # The given string is shorter than the required number of bits, # so we create the array from the string, and then we append # some 0 bytes. self.bytes = array('B', buf) self.bytes.extend(repeat(0, nbytes-len_buf)) else: # The given buf isn't a string. The following line isn't fast, but # it manages all lens of nbits compared to the len of input sequence. bytes = islice(chain(buf, repeat(0)), nbytes) self.bytes = array('B', bytes) def frombits(self, bits): """frombits(bits): creates a new bit array from the given sequence of bits. Bits can be 0, 1, '0', '1', False, True; it doesn't apply bool() to items. Whitespace (spaces, newlines, etc) is ignored. Frombits test 1: >>> b = Bitarray() >>> b.frombits('''00100\t000011 ... 101011001''') >>> print b 00100000011101011001 Frombits test 2: >>> b = Bitarray() >>> b.frombits(map(int, '00100000011101011001')) >>> print b 00100000011101011001 Frombits test 3: >>> b = Bitarray() >>> b.frombits("000100101001010x") Traceback (most recent call last): ... ValueError: Bitarray: bitbuf contains a char different from 0,1,'0','1' and whitespace. Frombits test 4: >>> b = Bitarray(nbits=10, buf='\\x00\\x03') >>> b.frombits('1111 1111 11') >>> print b 1111111111 """ # This method is slow, if necessary it can be speed up in some way self.bytes = array('B') self.nbits = 0 nbits = 0 for bit in bits: bit = bit_conv.get(bit, -1) if bit is None: # Spaces, newlines, etc. are ignored. continue if bit == -1: raise ValueError("Bitarray: bitbuf contains a char different " "from 0,1,'0','1' and whitespace.") self.append(bit) # This isn't fast nbits += 1 self.nbits = nbits def __eq__(self, other): """ __eq__ test 1: >>> b1 = Bitarray() >>> b1.frombits("010001") >>> b2 = Bitarray() >>> b2.frombits("010001") >>> print b2 010001 >>> b2 Bitarray(nbits=6, buf='"') >>> b1 == b2 True __eq__ test 2: >>> b1 = Bitarray(nbits=6, buf='b') >>> print b1 010001 >>> b1 Bitarray(nbits=6, buf='b') >>> b2 = Bitarray(nbits=6, buf='"') >>> print b2 010001 >>> b2 Bitarray(nbits=6, buf='"') >>> b1 == b2 True __eq__ test 3: >>> b1 = Bitarray(nbits=1, buf=[93]) >>> print b1 1 >>> b2 = Bitarray(nbits=1, buf=[215]) >>> print b2 1 >>> b1 == b2 True __eq__ test 4: >>> b1 = Bitarray(nbits=10, buf=[20, 30]) >>> print b1 0010100001 >>> b2 = Bitarray(nbits=10, buf=[40, 30]) >>> print b2 0001010001 >>> b1 == b2 False """ if not isinstance(other, self.__class__): return False if not hasattr(other, "nbits"): return False if not hasattr(other, "bytes"): return False if self.nbits != other.nbits: return False nbits_mod8 = self.nbits % 8 if not nbits_mod8: # In this situation all bits of the byte array are really used by the # Bitarray, so you can just compare those arrays. return self.bytes == other.bytes else: # Then some bits of the array aren't used, and they must be ignored. # Save the last byte of self. last_self = self.bytes[-1] last_other = other.bytes[-1] # Make equal the not important bits of the last byte of self to the # not important bits of the last byte of other. """ # The original code to compute the slicing and dicing of last_self and last_other # Note that the less significant bits are used first by the bitarray to2 = lambda b: "".join(_nibbles[nibble] for nibble in "%02X" % b) # output # x = int("0101011",2) # input left = nbits_mod8 firstbits = lambda x, n: (x >> n) << n lastbits = lambda x, n: x & ( (1 << n) - 1 ) self.bytes[-1] = firstbits(last_other, left) | lastbits(last_self, left) """ # Takes the nbits_mod8 more significan bits of last_other firstbits = (last_other >> nbits_mod8) << nbits_mod8 # Resets the nbits_mod8 more significan bits of last_self # To do it creates a mask of many 1s starting from the less significant ones # then does an AND with last_self lastbits = last_self & ( (1 << nbits_mod8) - 1 ) # merges the two parts self.bytes[-1] = firstbits | lastbits # Compare bytes of self with those of other. comparison = self.bytes == other.bytes # This restores the original byte inside self, to be sure self.bytes[-1] = last_self return comparison def hamming(self, other): """hamming(other): return the Hamming distance between two Bitarray with the same len; that is the number of different bits. If 'other' isn't a Bitarray or its len is different, it raises a TypeError. >>> b1 = Bitarray() >>> b1.hamming(1) Traceback (most recent call last): ... TypeError: Bitarray.hamming: the argument must be a Bitarray. >>> b2 = Bitarray(10) >>> b1.hamming(b2) Traceback (most recent call last): ... TypeError: Bitarray.hamming: expected the same number of bits. hamming test 1: >>> b1 = Bitarray() >>> b2 = Bitarray() >>> b1.hamming(b2) 0 hamming test 2: >>> b1 = Bitarray(8) >>> b2 = Bitarray(8) >>> b1.hamming(b2) 0 Function to speed up hamming tests: >>> def dist(bs1, bs2): ... b1 = Bitarray() ... b1.frombits(bs1.replace("_", "")) ... b2 = Bitarray() ... b2.frombits(bs2.replace("_", "")) ... return b1.hamming(b2) hamming test 3: >>> dist("1010_1101", "1010_1101") 0 >>> dist("", "") 0 >>> dist("1010_1101", "1110_1101") 1 >>> dist("1010_1101", "0101_0010") 8 >>> dist("1010_1101"*10, "0101_0010"*10) 80 >>> dist("1"*128, "0"*128) 128 >>> dist("0000_0000_11", "0000_0000_11") 0 >>> dist("0000_0000_11", "0000_0000_00") 2 >>> dist("0000_0000_110", "0000_0000_001") 3 >>> dist("0"*111, "1"*111) 111 hamming test 4: >>> b1 = Bitarray(nbits=6, buf='b') >>> print b1 010001 >>> b2 = Bitarray(nbits=6, buf='"') >>> print b2 010001 >>> b1.hamming(b2) 0 hamming test 5: >>> b1 = Bitarray(nbits=14, buf='0b') >>> print b1 00001100010001 >>> b2 = Bitarray(nbits=14, buf='0"') >>> print b2 00001100010001 >>> b1.hamming(b2) 0 """ if not isinstance(other, self.__class__) or not hasattr(other, "nbits") or \ not hasattr(other, "bytes"): raise TypeError("Bitarray.hamming: the argument must be a Bitarray.") if self.nbits != other.nbits: raise TypeError("Bitarray.hamming: expected the same number of bits.") # Count the bits of the xor of self an other bytearrays different = sum(_setbits[sb ^ ob] for sb, ob in izip(self.bytes, other.bytes)) nbits_mod8 = self.nbits % 8 if nbits_mod8: # In this situation there are some bits at the end that must be ignored # Note that the less significant bits are used first by the Bitarray """ # Original code: to2 = lambda b: "".join(_nibbles[nibble] for nibble in "%02X" % b) # output firstbits = lambda x, n: (x >> n) << n last_self = self.bytes[-1] last_other = other.bytes[-1] sb = firstbits(last_self, nbits_mod8) ob = firstbits(last_other, nbits_mod8) """ sb = (self.bytes[-1] >> nbits_mod8) << nbits_mod8 ob = (other.bytes[-1] >> nbits_mod8) << nbits_mod8 different -= _setbits[sb ^ ob] return different def __getitem__(self, i): if i >= self.nbits or i < 0: raise IndexError("Bitarray index out of range") byte, offset = divmod(i, 8) mask = 1 << offset if self.bytes[byte] & mask: return 1 else: return 0 def __setitem__(self, i, val): if i >= self.nbits or i < 0: raise IndexError("Bitarray index out of range") byte, offset = divmod(i, 8) mask = 1 << offset if val: self.bytes[byte] |= mask else: self.bytes[byte] &= ~mask def __iter__(self): if self.nbits == len(self.bytes) * 8: # This is a simple situation, all bits of the array have to be yielded for byte in self.bytes: for bit in _bytes2ibits[byte]: yield bit else: # Last bits of array don't have to be yielded. # This code looks a bit slow, do you know how to speed it up? nbytes = len(self.bytes) # First yield all the bits, but the last byte for byte in islice(self.bytes, 0, self.nbits // 8): for bit in _bytes2ibits[byte]: yield bit # Then yield only the really necessary bits of the last byte. # A very basic algorithm is used, using __getitem__, but at the end # there are only 1-7 bits, so this takes short time. for pos in xrange((self.nbits >> 3) << 3, self.nbits): yield self.__getitem__(pos) def append(self, bit): """append(bit): append 1 if bit is True or 0 if it's False. Append test: >>> be = Bitarray() >>> for i in xrange(100): ... be.append(1) ... assert be.nbits == i+1 ... assert int(ceil(be.nbits / 8.0)) == len(be.bytes) """ if not (self.nbits % 8): # If nbits is divisible by 8, then all bits are used, # then we need a new byte at the end. self.bytes.append(0) self.nbits += 1 self.__setitem__(self.nbits-1, bit) def __str__(self): "Print a string of 0/1 digits, that represents the Bitarray." # Slow method, but usually the print isn't used where speed is important return "".join(imap(str, self)) def __repr__(self): "Print a representation of the Bitarray that can be given to eval too." return 'Bitarray(nbits=%d, buf=%r)' % (self.nbits, self.bytes.tostring()) def toint(self): """toint(): return the base 10 integer representation of the bits. Note that this isn't reversible, because the leading zeros aren't present.""" return int(self.__str__()[::-1], 2) def __len__(self): "Return the number of bits of the Bitarray (the attribute nbits is the same)." return self.nbits def __getstate__(self): "For pickle, for the save." return self.nbits, self.bytes.tostring() def __setstate__(self, (nbits, text)): "For pickle, for the load." self.bytes = array('B', text) self.nbits = nbits def set(self): "set: sets all the bits of the Bitarray to 1." for pos in xrange(len(self.bytes)): self.bytes[pos] = 255 def reset(self): "reset: resets all the bits of the Bitarray to 0." for pos in xrange(len(self.bytes)): self.bytes[pos] = 0 __all__ = ["Bitarray"] if __name__ == "__main__": import doctest doctest.testmod() print "Doctests finished."