Changes

DPS915/M-N-M

16,715 bytes removed, 15:40, 20 February 2013

→‎lzw.h

[[Media:LZW.zip]]

~~=== lzw.h ===~~

~~<pre>~~

~~#ifndef UPRIGHT_LZW_H~~

~~#define UPRIGHT_LZW_H~~

~~/* LZW.h by N.A.Bozinis @ 19/01/2010 08:55:52~~

* ----------------------------------------------------------------------------------

*

* Plain C++ port of LZW compression algorithm and code originally (c) Mark R. Nelson

* http://marknelson.us/1989/10/01/lzw-data-compression

* Variable bit length encoding idea and code originally by Michael Dipperstein

* http://michael.dipperstein.com/lzw

*

* There are a lot of compression classes floating around but most are based on the

* zlib (zip/unzip) library, which is good but a bit of overkill for simple and small

* code. LZW combines decent compression ratios with very small code footprint. If

* you need something more powerful here are a few resources:

*

* http://www.codeproject.com/KB/files/zip_utils.aspx

* http://www.codeproject.com/KB/cpp/xzipunzip.aspx

* http://www.codeproject.com/KB/cpp/ChauMemzip.aspx

*

* Microsoft types can check the CAB protocol that is available in all windows:

* http://www.codeproject.com/KB/files/CABCompressExtract.aspx

* http://msdn.microsoft.com/en-us/library/bb417343.aspx

*

*/

~~#include <stdio.h>~~

~~#include <stdlib.h>~~

~~#include <limits.h>~~

~~#include <string.h>~~

~~#include <assert.h>~~

~~#define ATLASSERT assert~~

~~/* NOTE: function and variable names left as much as possible matching the original~~

~~LZW.c by Mark, naturally bundled in classes to get rid of static/globals etc~~

*/

~~#define MIN_CODE_LEN 9 /* min # bits in a code word */~~

~~#define MAX_CODE_LEN 20 /* max # bits in a code word */~~

~~#define CURRENT_MAX_CODES(x) (1UL << (x))~~

~~#define FIRST_CODE (1 << CHAR_BIT) /* value of 1st string code */~~

~~#if (MIN_CODE_LEN <= CHAR_BIT)~~

~~#error Code words must be larger than 1 character~~

~~#endif~~

~~#if (MAX_CODE_LEN >= 25)~~

~~#error Code words must fit in an integer~~

~~#endif~~

~~/* VARIABLE BIT LENGTH ENCODING~~

* Instead of using a fixed number of bits for code words, we start at 9 (=MIN_CODE_LEN)

* and go up to BITS (<=MAX_CODE_LEN) so that small files are tightly packed and larger

* files are fine too. The BITS constant determines the maximum hash table size. For 18

* this means 250KB runtime table size which is enough for files ~4MB.

* There is no problem for files larger than that; if we run out of table space for new

* codes then the same codes are emitted (uncompressed obviously)

*/

~~#define BITS 17 /* Setting the number of bits to 12, 13*/~~

~~#define HASHING_SHIFT (BITS-8) /* or 14 affects several constants. */~~

~~#define MAX_VALUE (1 << BITS) - 1 /* Note that MS-DOS machines need to */~~

~~#define MAX_CODE MAX_VALUE - 1 /* compile their code in large model if*/~~

~~/* 14 bits are selected. */~~

~~#if BITS == 20~~

~~#define TABLE_SIZE 1048583~~

~~#elif BITS == 19~~

~~#define TABLE_SIZE 524309~~

~~#elif BITS == 18~~

~~#define TABLE_SIZE 262147~~

~~#elif BITS == 17~~

~~#define TABLE_SIZE 131101~~

~~#elif BITS == 16~~

~~#define TABLE_SIZE 65543~~

~~#elif BITS == 15~~

~~#define TABLE_SIZE 32797~~

~~#elif BITS == 14~~

~~#define TABLE_SIZE 18041 /* The string table size needs to be a */~~

~~/* prime number that is somewhat larger*/~~

~~#elif BITS == 13 /* than 2**BITS. */~~

~~#define TABLE_SIZE 9029~~

~~#elif BITS == 12~~

~~#define TABLE_SIZE 5021~~

~~#else~~

~~#error define smaller or bigger table sizes~~

~~#endif~~

~~#if (TABLE_SIZE <= MAX_VALUE)~~

~~#error your prime numbers need attention~~

~~#endif~~

~~#if (BITS > MAX_CODE_LEN)~~

~~#error BITS can only go up to a maximum~~

~~#endif~~

/*

~~This class does most of the job, except reading source and writing the compressed data~~

~~A derived class does that so that there's flexibility to read either from files or memory~~

*/

~~class CLZWImpl {~~

~~protected:~~

~~int *code_value; /* This is the code value array */~~

~~unsigned int *prefix_code; /* This array holds the prefix codes */~~

~~unsigned char *append_character; /* This array holds the appended chars */~~

~~unsigned char decode_stack[4000]; /* This array holds the decoded string */~~

~~unsigned char CUR_BITS; /* ~nab: added for variable bit size */~~

~~/* we are processing bits but in the end of the day we do I/O in bytes */~~

~~int input_bit_count, output_bit_count;~~

~~unsigned long input_bit_buffer, output_bit_buffer;~~

~~public:~~

~~CLZWImpl() {~~

~~code_value = 0;~~

~~prefix_code = 0;~~

~~append_character = 0;~~

}

~~~CLZWImpl() {~~

~~if(code_value)~~

~~free(code_value);~~

~~if(prefix_code)~~

~~free(prefix_code);~~

~~if(append_character)~~

~~free(append_character);~~

}

~~int get_bits() { return CUR_BITS; }~~

~~protected:~~

~~int Init() {~~

~~ATLASSERT(!code_value); /* call just once */~~

~~code_value=(int*)malloc(TABLE_SIZE*sizeof(int));~~

~~prefix_code=(unsigned int*)malloc(TABLE_SIZE*sizeof(unsigned int));~~

~~append_character=(unsigned char*)malloc(TABLE_SIZE*sizeof(unsigned char));~~

~~return code_value != 0 && prefix_code != 0 && append_character != 0;~~

}

~~/* override these 4: read a byte from source */~~

~~virtual int getc_src() = 0;~~

~~/* read a byte from compressed source (during expansion) and write to compressed output */~~

~~virtual int getc_comp() = 0;~~

~~/* write a byte to compressed output */~~

~~virtual int putc_comp(int ch) = 0;~~

~~/* write a byte to expanded output */~~

~~virtual int putc_out(int ch) = 0;~~

/*

** This is the compression routine. The code should be a fairly close

** match to the algorithm accompanying the article.

**

*/

~~void compress()~~

{

~~unsigned int next_code;~~

~~unsigned int character;~~

~~unsigned int string_code;~~

~~unsigned int index;~~

~~unsigned int bit_limit;~~

~~int i;~~

~~ATLASSERT(code_value); /* initialized? */~~

~~CUR_BITS = MIN_CODE_LEN;~~

~~bit_limit = CURRENT_MAX_CODES(CUR_BITS) - 1;~~

~~output_bit_count=0;~~

~~output_bit_buffer=0L;~~

~~ATLASSERT(256==FIRST_CODE);~~

~~next_code=FIRST_CODE; /* Next code is the next available string code*/~~

~~for (i=0;i<TABLE_SIZE;i++) /* Clear out the string table before starting */~~

~~code_value[i]=-1;~~

~~string_code=getc_src(); /* Get the first code */~~

~~if(-1 == string_code)~~

~~return; /* empty file or error */~~

/*

** This is the main loop where it all happens. This loop runs util all of

** the input has been exhausted. Note that it stops adding codes to the

** table after all of the possible codes have been defined.

*/

~~while ((character=getc_src()) != -1)~~

{

~~index=find_match(string_code,character);/* See if the string is in */~~

~~if (code_value[index] != -1) /* the table. If it is, */~~

~~string_code=code_value[index]; /* get the code value. If */~~

~~else /* the string is not in the*/~~

~~{ /* table, try to add it. */~~

~~if (next_code <= MAX_CODE)~~

{

~~code_value[index]=next_code++;~~

~~prefix_code[index]=string_code;~~

~~append_character[index]=character;~~

}

~~/* are we using enough bits to write out this code word? */~~

~~if(string_code >= bit_limit && CUR_BITS < BITS)~~

{

~~/* mark need for bigger code word with all ones */~~

~~output_code(bit_limit);~~

~~CUR_BITS++;~~

~~bit_limit = (CURRENT_MAX_CODES(CUR_BITS) - 1);~~

}

~~ATLASSERT(string_code < bit_limit);~~

~~output_code(string_code); /* When a string is found */~~

~~string_code=character; /* that is not in the table*/~~

~~} /* I output the last string*/~~

~~} /* after adding the new one*/~~

/*

** End of the main loop.

*/

~~output_code(string_code); /* Output the last code */~~

~~output_code(-1); /* This code flushes the output buffer*/~~

}

/*

** This is the hashing routine. It tries to find a match for the prefix+char

** string in the string table. If it finds it, the index is returned. If

** the string is not found, the first available index in the string table is

** returned instead.

*/

~~int find_match(unsigned int hash_prefix,unsigned int hash_character)~~

{

~~int index;~~

~~int offset;~~

~~index = (hash_character << HASHING_SHIFT) ^ hash_prefix;~~

~~if (index == 0)~~

~~offset = 1;~~

~~else~~

~~offset = TABLE_SIZE - index;~~

~~while (1)~~

{

~~if (code_value[index] == -1)~~

~~return(index);~~

~~if (prefix_code[index] == hash_prefix &&~~

~~append_character[index] == hash_character)~~

~~return(index);~~

~~index -= offset;~~

~~if (index < 0)~~

~~index += TABLE_SIZE;~~

}

/*

** This is the expansion routine. It takes an LZW format file, and expands

** it to an output file. The code here should be a fairly close match to

** the algorithm in the accompanying article.

*/

~~void expand()~~

{

~~unsigned int next_code;~~

~~unsigned int new_code;~~

~~unsigned int old_code;~~

~~int character;~~

~~unsigned char *string;~~

~~unsigned int bit_limit;~~

~~ATLASSERT(code_value); /* initialized? */~~

~~CUR_BITS = MIN_CODE_LEN;~~

~~bit_limit = CURRENT_MAX_CODES(CUR_BITS) - 1;~~

~~input_bit_count=0;~~

~~input_bit_buffer=0L;~~

~~// @@@ what if we pass uncompressed file to decode?~~

~~next_code=FIRST_CODE; /* This is the next available code to define */~~

~~old_code=input_code(); /* Read in the first code, initialize the */~~

~~if(-1 == old_code)~~

~~return; /* read error? */~~

~~character=old_code; /* character variable, and send the first */~~

~~if(putc_out(old_code)==-1) /* code to the output file */~~

~~return; /* write error */~~

/*

** This is the main expansion loop. It reads in characters from the LZW file

** until it sees the special code used to inidicate the end of the data.

*/

~~while ((new_code=input_code()) != (-1))~~

{

~~/* look for code length increase marker */~~

~~if(bit_limit == new_code && CUR_BITS < BITS)~~

{

~~CUR_BITS++;~~

~~bit_limit = CURRENT_MAX_CODES(CUR_BITS) - 1;~~

~~new_code=input_code();~~

~~ATLASSERT(new_code != -1); /* must be read error? */~~

~~if(new_code == -1)~~

~~break;~~

}

~~ATLASSERT(new_code < bit_limit);~~

/*

** This code checks for the special STRING+CHARACTER+STRING+CHARACTER+STRING

** case which generates an undefined code. It handles it by decoding

** the last code, and adding a single character to the end of the decode string.

*/

~~if (new_code>=next_code)~~

{

*decode_stack=character;

~~string=decode_string(decode_stack+1,old_code);~~

}

/*

** Otherwise we do a straight decode of the new code.

*/

~~else~~

~~string=decode_string(decode_stack,new_code);~~

/*

** Now we output the decoded string in reverse order.

*/

~~character=*string;~~

~~while (string >= decode_stack)~~

~~putc_out(*string--);~~

/*

** Finally, if possible, add a new code to the string table.

*/

~~if (next_code <= MAX_CODE)~~

{

~~prefix_code[next_code]=old_code;~~

~~append_character[next_code]=character;~~

~~next_code++;~~

}

~~old_code=new_code;~~

}

/*

** This routine simply decodes a string from the string table, storing

** it in a buffer. The buffer can then be output in reverse order by

** the expansion program.

*/

~~/* ~nab: these char* aren't a risk for unicode; we are reading bytes */~~

~~unsigned char *decode_string(unsigned char *buffer,unsigned int code)~~

{

~~int i;~~

~~i=0;~~

~~while (code >= FIRST_CODE)~~

{

*buffer++ = append_character[code];

~~code=prefix_code[code];~~

~~i++;~~

~~ATLASSERT(i < sizeof(decode_stack)); /* buffer overrun if it blows, increase stack size! */~~

}

*buffer=code;

~~return(buffer);~~

}

/*

** The following two routines are used to output variable length

** codes. They are written strictly for clarity, and are not

** particularyl efficient.

~~~nab: there's room for improvement in these I/O functions eg work in DWORDS instead of bytes~~

*/

~~unsigned int input_code()~~

{

~~int c;~~

~~unsigned int return_value;~~

~~//static int input_bit_count=0;~~

~~//static unsigned long input_bit_buffer=0L;~~

~~while (input_bit_count <= 24)~~

{

~~if ((c = getc_comp()) == -1)~~

~~break;~~

~~input_bit_buffer |=~~

~~(unsigned long) c << (24-input_bit_count);~~

~~input_bit_count += 8;~~

}

~~if(input_bit_count < CUR_BITS) {~~

~~ATLASSERT(!input_bit_buffer);~~

~~return -1; /* EOF */~~

}

~~return_value=input_bit_buffer >> (32-CUR_BITS);~~

~~input_bit_buffer <<= CUR_BITS;~~

~~input_bit_count -= CUR_BITS;~~

~~ATLASSERT(return_value < (1UL << CUR_BITS));~~

~~return(return_value);~~

}

~~/* bits are written outside normal byte boundaries, hence the need for keeping old values */~~

~~void output_code(unsigned int code)~~

{

~~//static int output_bit_count=0;~~

~~//static unsigned long output_bit_buffer=0L;~~

~~ATLASSERT(output_bit_count < 8); /* leftovers */~~

~~ATLASSERT(CUR_BITS + output_bit_count <= 32);~~

~~/*codes <256 are possible for single characters, zero bytes etc*/~~

~~if(-1 == code) {~~

~~/* pad remaining zeros and flush the last byte */~~

~~if(output_bit_count) {~~

~~output_bit_buffer >>= 24;~~

~~ATLASSERT((output_bit_buffer & 0xFF) == output_bit_buffer);~~

~~putc_comp(output_bit_buffer);~~

~~output_bit_count = 0;~~

~~output_bit_buffer = 0; /* in case some eejit calls us again */~~

}

~~return;~~

}

~~ATLASSERT(code < (1UL << CUR_BITS));~~

~~/* sends new bytes near the top (MSB) */~~

~~output_bit_buffer |= (unsigned long) code << (32-CUR_BITS-output_bit_count);~~

~~output_bit_count += CUR_BITS;~~

~~while (output_bit_count >= 8)~~

{

~~/* no check for error but if there was a problem we'd know from the time we wrote the identifier */~~

~~putc_comp(output_bit_buffer >> 24);~~

~~output_bit_buffer <<= 8;~~

~~output_bit_count -= 8;~~

}

~~}; /* CLZWImpl */~~

~~/* example derived class using C buffered I/O functions */~~

~~class CLZWCompressFile : public CLZWImpl {~~

~~public:~~

~~CLZWCompressFile() {~~

~~io_file = 0;~~

~~lzw_file = 0;~~

};

~~~CLZWCompressFile() {~~

~~ATLASSERT(!io_file);~~

~~ATLASSERT(!lzw_file);~~

};

~~int AnyIOErrors() {return io_error; }~~

~~// @@@ these char* should be changed for unicode builds~~

~~unsigned int Compress(char* input_file_name, char* to_name)~~

{

~~ATLASSERT(input_file_name && *input_file_name);~~

~~ATLASSERT(to_name && *to_name);~~

~~ATLASSERT(strcmp(to_name, input_file_name));~~

~~io_error = 1;~~

~~if(!code_value)~~

~~if(!Init())~~

~~return 0; /* rare memory error */~~

~~u_comp = 0;~~

~~u_io = 0;~~

~~io_file=fopen(input_file_name,"rb");~~

~~if(io_file) {~~

~~lzw_file=fopen(to_name,"wb");~~

~~if(lzw_file) {~~

~~/* write LZW identifier L+starting bytes */~~

~~putc('L', lzw_file);~~

~~if(putc(MIN_CODE_LEN, lzw_file) == MIN_CODE_LEN) {~~

~~compress();~~

~~io_error = ferror(lzw_file) || ferror(io_file);~~

~~if(!io_error)~~

~~ATLASSERT(u_comp <= u_io); /* this is bound to bomb every now and then, no compression! */~~

}

~~fclose(lzw_file);~~

~~lzw_file = 0;~~

}

~~fclose(io_file);~~

~~io_file = 0;~~

}

~~return u_comp;~~

}

~~unsigned int Expand(char* lzw_name, char* to_name)~~

{

~~ATLASSERT(lzw_name && *lzw_name);~~

~~ATLASSERT(to_name && *to_name);~~

~~ATLASSERT(strcmp(to_name, lzw_name));~~

~~io_error = 1;~~

~~if(!code_value)~~

~~if(!Init())~~

~~return 0; /* rare memory error */~~

~~u_comp = 0;~~

~~u_io = 0;~~

~~lzw_file=fopen(lzw_name,"rb");~~

~~if(lzw_file) {~~

~~/* check LZW identifier L+starting bytes */~~

~~int ch1 = getc(lzw_file);~~

~~int ch2 = getc(lzw_file);~~

~~if('L' == ch1 && MIN_CODE_LEN==ch2) {~~

~~io_file=fopen(to_name,"wb");~~

~~if(io_file) {~~

~~expand();~~

~~io_error = ferror(lzw_file) || ferror(io_file);~~

~~fclose(io_file);~~

~~io_file = 0;~~

}

~~fclose(lzw_file);~~

~~lzw_file = 0;~~

}

~~return u_io;~~

}

~~protected:~~

~~/* -1 return indicates either EOF or some IO error */~~

~~virtual int getc_src() {~~

~~ATLASSERT(io_file);~~

~~int ch = getc(io_file);~~

~~if(EOF == ch)~~

~~return -1;~~

~~u_io++;~~

~~return ch;~~

}

~~virtual int getc_comp() {~~

~~ATLASSERT(lzw_file);~~

~~int ch = getc(lzw_file);~~

~~if(EOF == ch)~~

~~return -1;~~

~~u_comp++;~~

~~return ch;~~

}

~~virtual int putc_comp(int ch) {~~

~~ATLASSERT(lzw_file);~~

~~ATLASSERT(ch >= 0 && ch < 256);~~

~~int ret = putc(ch, lzw_file);~~

~~if(ret != EOF) {~~

~~ATLASSERT(ret == ch);~~

~~u_comp++;~~

}

~~else~~

~~ret = -1;~~

~~return ret;~~

}

~~virtual int putc_out(int ch) {~~

~~ATLASSERT(io_file);~~

~~ATLASSERT(ch >= 0 && ch < 256);~~

~~int ret = putc(ch, io_file);~~

~~if(ret != EOF)~~

~~u_io++;~~

~~else~~

~~ret = -1;~~

~~return ret;~~

}

~~FILE* io_file;~~

~~FILE *lzw_file;~~

~~int io_error;~~

~~public:~~

~~unsigned long u_io, u_comp; /* bytes read and written */~~

};

~~// @@@ could have a generic one on IStream, CreateStreamOnHGlobal/SHCreateStreamOnFile~~

~~#endif /* UPRIGHT_LZW_H */~~

~~</pre>~~

~~----~~

=== Assignment 2 ===

=== Assignment 3 ===

Nprakashpanicker

1

edit

CDOT Wiki β

Changes

DPS915/M-N-M

CDOT Wiki ^β