Difference between revisions of "Lzw.h"

From CDOT Wiki
Jump to: navigation, search
(Created page with '<pre> #ifndef UPRIGHT_LZW_H #define UPRIGHT_LZW_H /* LZW.h by N.A.Bozinis @ 19/01/2010 08:55:52 * -----------------------------------------------------------------------…')
 
(No difference)

Latest revision as of 15:55, 20 February 2013


#ifndef UPRIGHT_LZW_H

#define UPRIGHT_LZW_H





/* LZW.h by N.A.Bozinis @ 19/01/2010 08:55:52

 * ----------------------------------------------------------------------------------

 *

 * Plain C++ port of LZW compression algorithm and code originally (c) Mark R. Nelson

 * http://marknelson.us/1989/10/01/lzw-data-compression

 * Variable bit length encoding idea and code originally by Michael Dipperstein

 * http://michael.dipperstein.com/lzw

 *

 * There are a lot of compression classes floating around but most are based on the

 * zlib (zip/unzip) library, which is good but a bit of overkill for simple and small

 * code. LZW combines decent compression ratios with very small code footprint. If

 * you need something more powerful here are a few resources:

 *

 *      http://www.codeproject.com/KB/files/zip_utils.aspx

 *      http://www.codeproject.com/KB/cpp/xzipunzip.aspx

 *      http://www.codeproject.com/KB/cpp/ChauMemzip.aspx

 *

 * Microsoft types can check the CAB protocol that is available in all windows:

 *      http://www.codeproject.com/KB/files/CABCompressExtract.aspx

 *      http://msdn.microsoft.com/en-us/library/bb417343.aspx

 *

 */



#include <stdio.h>

#include <stdlib.h>

#include <limits.h>

#include <string.h>

#include <assert.h>

#define ATLASSERT assert



/* NOTE: function and variable names left as much as possible matching the original

	LZW.c by Mark, naturally bundled in classes to get rid of static/globals etc

*/



#define MIN_CODE_LEN    9                   /* min # bits in a code word */

#define MAX_CODE_LEN    20                  /* max # bits in a code word */

#define CURRENT_MAX_CODES(x)     (1UL << (x))



#define FIRST_CODE      (1 << CHAR_BIT)     /* value of 1st string code */



#if (MIN_CODE_LEN <= CHAR_BIT)

#error Code words must be larger than 1 character

#endif



#if (MAX_CODE_LEN >= 25)

#error Code words must fit in an integer

#endif





/* VARIABLE BIT LENGTH ENCODING

 * Instead of using a fixed number of bits for code words, we start at 9 (=MIN_CODE_LEN)

 * and go up to BITS (<=MAX_CODE_LEN) so that small files are tightly packed and larger

 * files are fine too. The BITS constant determines the maximum hash table size. For 18

 * this means 250KB runtime table size which is enough for files ~4MB.

 * There is no problem for files larger than that; if we run out of table space for new

 * codes then the same codes are emitted (uncompressed obviously)

 */



#define BITS 17                   /* Setting the number of bits to 12, 13*/

#define HASHING_SHIFT (BITS-8)    /* or 14 affects several constants.    */

#define MAX_VALUE (1 << BITS) - 1 /* Note that MS-DOS machines need to   */

#define MAX_CODE MAX_VALUE - 1    /* compile their code in large model if*/

				  /* 14 bits are selected.               */



#if BITS == 20

	#define TABLE_SIZE 1048583

#elif BITS == 19

	#define TABLE_SIZE 524309

#elif BITS == 18

	#define TABLE_SIZE 262147

#elif BITS == 17

	#define TABLE_SIZE 131101

#elif BITS == 16

	#define TABLE_SIZE 65543

#elif BITS == 15

	#define TABLE_SIZE 32797

#elif BITS == 14

  #define TABLE_SIZE 18041        /* The string table size needs to be a */

				  /* prime number that is somewhat larger*/

#elif BITS == 13                  /* than 2**BITS.                       */

  #define TABLE_SIZE 9029

#elif BITS == 12

  #define TABLE_SIZE 5021

#else

#error define smaller or bigger table sizes

#endif



#if (TABLE_SIZE <= MAX_VALUE)

#error your prime numbers need attention

#endif



#if (BITS > MAX_CODE_LEN)

#error BITS can only go up to a maximum

#endif





/*

 This class does most of the job, except reading source and writing the compressed data

 A derived class does that so that there's flexibility to read either from files or memory

 */



class CLZWImpl {

protected:

int *code_value;                  /* This is the code value array        */

unsigned int *prefix_code;        /* This array holds the prefix codes   */

unsigned char *append_character;  /* This array holds the appended chars */

unsigned char decode_stack[4000]; /* This array holds the decoded string */

unsigned char CUR_BITS;           /* ~nab: added for variable bit size */

/* we are processing bits but in the end of the day we do I/O in bytes */

int input_bit_count, output_bit_count;

unsigned long input_bit_buffer, output_bit_buffer;



public:

CLZWImpl() {

	code_value = 0;

	prefix_code = 0;

	append_character = 0;

}



~CLZWImpl() {

	if(code_value)

		free(code_value);

	if(prefix_code)

		free(prefix_code);

	if(append_character)

		free(append_character);

}



int get_bits() { return CUR_BITS; }



protected:

int Init() {

	ATLASSERT(!code_value); /* call just once */



	code_value=(int*)malloc(TABLE_SIZE*sizeof(int));

	prefix_code=(unsigned int*)malloc(TABLE_SIZE*sizeof(unsigned int));

	append_character=(unsigned char*)malloc(TABLE_SIZE*sizeof(unsigned char));



	return code_value != 0 && prefix_code != 0 && append_character != 0;

}



/* override these 4: read a byte from source */

virtual int getc_src() = 0;

/* read a byte from compressed source (during expansion) and write to compressed output */

virtual int getc_comp() = 0;

/* write a byte to compressed output */

virtual int putc_comp(int ch) = 0;

/* write a byte to expanded output */

virtual int putc_out(int ch) = 0;



/*

** This is the compression routine.  The code should be a fairly close

** match to the algorithm accompanying the article.

**

*/



void compress()

{

unsigned int next_code;

unsigned int character;

unsigned int string_code;

unsigned int index;

unsigned int bit_limit;

int i;



	ATLASSERT(code_value); /* initialized? */



	CUR_BITS = MIN_CODE_LEN;

	bit_limit = CURRENT_MAX_CODES(CUR_BITS) - 1;

	output_bit_count=0;

	output_bit_buffer=0L;



	ATLASSERT(256==FIRST_CODE);

	next_code=FIRST_CODE;       /* Next code is the next available string code*/

	for (i=0;i<TABLE_SIZE;i++)  /* Clear out the string table before starting */

		code_value[i]=-1;



  string_code=getc_src();      /* Get the first code                         */

  if(-1 == string_code)

	  return; /* empty file or error */



/*

** This is the main loop where it all happens.  This loop runs util all of

** the input has been exhausted.  Note that it stops adding codes to the

** table after all of the possible codes have been defined.

*/

  while ((character=getc_src()) != -1)

  {

    index=find_match(string_code,character);/* See if the string is in */

    if (code_value[index] != -1)            /* the table.  If it is,   */

      string_code=code_value[index];        /* get the code value.  If */

    else                                    /* the string is not in the*/

    {                                       /* table, try to add it.   */

      if (next_code <= MAX_CODE)

      {

	code_value[index]=next_code++;

	prefix_code[index]=string_code;

	append_character[index]=character;

      }



		/* are we using enough bits to write out this code word? */

		if(string_code >= bit_limit && CUR_BITS < BITS)

		{

			/* mark need for bigger code word with all ones */

			output_code(bit_limit);

			CUR_BITS++;

			bit_limit = (CURRENT_MAX_CODES(CUR_BITS) - 1);

		}



		ATLASSERT(string_code < bit_limit);



      output_code(string_code);  /* When a string is found  */

      string_code=character;            /* that is not in the table*/

    }                                   /* I output the last string*/

  }                                     /* after adding the new one*/

/*

** End of the main loop.

*/



  output_code(string_code); /* Output the last code               */

  output_code(-1);          /* This code flushes the output buffer*/

}



/*

** This is the hashing routine.  It tries to find a match for the prefix+char

** string in the string table.  If it finds it, the index is returned.  If

** the string is not found, the first available index in the string table is

** returned instead.

*/

int find_match(unsigned int hash_prefix,unsigned int hash_character)

{

int index;

int offset;



  index = (hash_character << HASHING_SHIFT) ^ hash_prefix;

  if (index == 0)

    offset = 1;

  else

    offset = TABLE_SIZE - index;

  while (1)

  {

    if (code_value[index] == -1)

      return(index);

    if (prefix_code[index] == hash_prefix &&

	append_character[index] == hash_character)

      return(index);

    index -= offset;

    if (index < 0)

      index += TABLE_SIZE;

  }

}



/*

**  This is the expansion routine.  It takes an LZW format file, and expands

**  it to an output file.  The code here should be a fairly close match to

**  the algorithm in the accompanying article.

*/



void expand()

{

unsigned int next_code;

unsigned int new_code;

unsigned int old_code;

int character;

unsigned char *string;

unsigned int bit_limit;



	ATLASSERT(code_value); /* initialized? */



	CUR_BITS = MIN_CODE_LEN;

	bit_limit = CURRENT_MAX_CODES(CUR_BITS) - 1;

	input_bit_count=0;

	input_bit_buffer=0L;



// @@@ what if we pass uncompressed file to decode?



  next_code=FIRST_CODE;        /* This is the next available code to define */



  old_code=input_code();       /* Read in the first code, initialize the */

  if(-1 == old_code)

	  return; /* read error? */

  character=old_code;          /* character variable, and send the first */

  if(putc_out(old_code)==-1)   /* code to the output file                */

	  return; /* write error */

/*

**  This is the main expansion loop.  It reads in characters from the LZW file

**  until it sees the special code used to inidicate the end of the data.

*/

  while ((new_code=input_code()) != (-1))

  {

	  /* look for code length increase marker */

	  if(bit_limit == new_code && CUR_BITS < BITS)

	  {

		  CUR_BITS++;

		  bit_limit = CURRENT_MAX_CODES(CUR_BITS) - 1;



		  new_code=input_code();

		  ATLASSERT(new_code != -1); /* must be read error? */

		  if(new_code == -1)

			  break;

	  }



		ATLASSERT(new_code < bit_limit);



/*

** This code checks for the special STRING+CHARACTER+STRING+CHARACTER+STRING

** case which generates an undefined code.  It handles it by decoding

** the last code, and adding a single character to the end of the decode string.

*/

    if (new_code>=next_code)

    {

      *decode_stack=character;

      string=decode_string(decode_stack+1,old_code);

    }

/*

** Otherwise we do a straight decode of the new code.

*/

    else

      string=decode_string(decode_stack,new_code);

/*

** Now we output the decoded string in reverse order.

*/

    character=*string;

    while (string >= decode_stack)

      putc_out(*string--);

/*

** Finally, if possible, add a new code to the string table.

*/

    if (next_code <= MAX_CODE)

    {

      prefix_code[next_code]=old_code;

      append_character[next_code]=character;

      next_code++;

    }

    old_code=new_code;

  }

}



/*

** This routine simply decodes a string from the string table, storing

** it in a buffer.  The buffer can then be output in reverse order by

** the expansion program.

*/

/* ~nab: these char* aren't a risk for unicode; we are reading bytes */

unsigned char *decode_string(unsigned char *buffer,unsigned int code)

{

int i;



  i=0;

  while (code >= FIRST_CODE)

  {

    *buffer++ = append_character[code];

    code=prefix_code[code];

	 i++;

	 ATLASSERT(i < sizeof(decode_stack)); /* buffer overrun if it blows, increase stack size! */

  }

  *buffer=code;

  return(buffer);

}



/*

** The following two routines are used to output variable length

** codes.  They are written strictly for clarity, and are not

** particularyl efficient.



  ~nab: there's room for improvement in these I/O functions eg work in DWORDS instead of bytes

*/



unsigned int input_code()

{

int c;

unsigned int return_value;

//static int input_bit_count=0;

//static unsigned long input_bit_buffer=0L;



  while (input_bit_count <= 24)

  {

		if ((c = getc_comp()) == -1)

			break;



    input_bit_buffer |=

	(unsigned long) c << (24-input_bit_count);

    input_bit_count += 8;

  }



  if(input_bit_count < CUR_BITS) {

	  ATLASSERT(!input_bit_buffer);

		return -1; /* EOF */

	}



  return_value=input_bit_buffer >> (32-CUR_BITS);

  input_bit_buffer <<= CUR_BITS;

  input_bit_count -= CUR_BITS;



  ATLASSERT(return_value < (1UL << CUR_BITS));

  return(return_value);

}



/* bits are written outside normal byte boundaries, hence the need for keeping old values */

void output_code(unsigned int code)

{

//static int output_bit_count=0;

//static unsigned long output_bit_buffer=0L;



	ATLASSERT(output_bit_count < 8); /* leftovers */

	ATLASSERT(CUR_BITS + output_bit_count <= 32);

	/*codes <256 are possible for single characters, zero bytes etc*/



	if(-1 == code) {

		/* pad remaining zeros and flush the last byte */

		if(output_bit_count) {

			output_bit_buffer >>= 24;

			ATLASSERT((output_bit_buffer & 0xFF) == output_bit_buffer);

			putc_comp(output_bit_buffer);



			output_bit_count = 0;

			output_bit_buffer = 0; /* in case some eejit calls us again */

		}



		return;

	}



	ATLASSERT(code < (1UL << CUR_BITS));



	/* sends new bytes near the top (MSB) */

  output_bit_buffer |= (unsigned long) code << (32-CUR_BITS-output_bit_count);

  output_bit_count += CUR_BITS;

  while (output_bit_count >= 8)

  {

	 /* no check for error but if there was a problem we'd know from the time we wrote the identifier */

    putc_comp(output_bit_buffer >> 24);

    output_bit_buffer <<= 8;

    output_bit_count -= 8;

  }

}

}; /* CLZWImpl */



/* example derived class using C buffered I/O functions */

class CLZWCompressFile : public CLZWImpl {

public:

	CLZWCompressFile() {

		io_file = 0;

		lzw_file = 0;

	};



	~CLZWCompressFile() {

		ATLASSERT(!io_file);

		ATLASSERT(!lzw_file);

	};



	int AnyIOErrors() {return io_error; }



	// @@@ these char* should be changed for unicode builds

	unsigned int Compress(char* input_file_name, char* to_name)

	{

		ATLASSERT(input_file_name && *input_file_name);

		ATLASSERT(to_name && *to_name);

		ATLASSERT(strcmp(to_name, input_file_name));



		io_error = 1;



		if(!code_value)

			if(!Init())

				return 0; /* rare memory error */



		u_comp = 0;

		u_io = 0;

		io_file=fopen(input_file_name,"rb");

		if(io_file) {

			lzw_file=fopen(to_name,"wb");

			if(lzw_file) {

				/* write LZW identifier L+starting bytes */

				putc('L', lzw_file);

				if(putc(MIN_CODE_LEN, lzw_file) == MIN_CODE_LEN) {

					compress();

					io_error = ferror(lzw_file) || ferror(io_file);

					if(!io_error)

						ATLASSERT(u_comp <= u_io); /* this is bound to bomb every now and then, no compression! */

				}

				fclose(lzw_file);

				lzw_file = 0;

			}



			fclose(io_file);

			io_file = 0;

		}



		return u_comp;

	}



	unsigned int Expand(char* lzw_name, char* to_name)

	{

		ATLASSERT(lzw_name && *lzw_name);

		ATLASSERT(to_name && *to_name);

		ATLASSERT(strcmp(to_name, lzw_name));



		io_error = 1;



		if(!code_value)

			if(!Init())

				return 0; /* rare memory error */



		u_comp = 0;

		u_io = 0;

		lzw_file=fopen(lzw_name,"rb");

		if(lzw_file) {

			/* check LZW identifier L+starting bytes */

			int ch1 = getc(lzw_file);

			int ch2 = getc(lzw_file);

			if('L' == ch1 && MIN_CODE_LEN==ch2) {

				io_file=fopen(to_name,"wb");

				if(io_file) {

					expand();

					io_error = ferror(lzw_file) || ferror(io_file);



					fclose(io_file);

					io_file = 0;

				}

			}



			fclose(lzw_file);

			lzw_file = 0;

		}



		return u_io;

	}



protected:

	/* -1 return indicates either EOF or some IO error */

	virtual int getc_src() {

		ATLASSERT(io_file);

		int ch = getc(io_file);

		if(EOF == ch)

			return -1;



		u_io++;

		return ch;

	}

	virtual int getc_comp() {

		ATLASSERT(lzw_file);

		int ch = getc(lzw_file);

		if(EOF == ch)

			return -1;



		u_comp++;

		return ch;

	}

	virtual int putc_comp(int ch) {

		ATLASSERT(lzw_file);

		ATLASSERT(ch >= 0 && ch < 256);

		int ret = putc(ch, lzw_file);



		if(ret != EOF) {

			ATLASSERT(ret == ch);

			u_comp++;

		}

		else

			ret = -1;



		return ret;

	}

	virtual int putc_out(int ch) {

		ATLASSERT(io_file);

		ATLASSERT(ch >= 0 && ch < 256);

		int ret = putc(ch, io_file);



		if(ret != EOF)

			u_io++;

		else

			ret = -1;



		return ret;

	}



	FILE* io_file;

	FILE *lzw_file;

	int io_error;

public:

	unsigned long u_io, u_comp; /* bytes read and written */

};



// @@@ could have a generic one on IStream, CreateStreamOnHGlobal/SHCreateStreamOnFile



#endif /* UPRIGHT_LZW_H */