Changes

Jump to: navigation, search

DPS915/M-N-M

18,205 bytes added, 15:18, 20 February 2013
Assignment 1
0.21 48.45 0.10 7095561 14.09 14.09 CLZWCompressFile::putc_comp(int)
</pre>
 
=== lzw.cpp ===
 
<pre>
 
#include <time.h>
 
#include "lzw.h"
 
 
 
/********************************************************************
 
**
 
** This program gets a file name from the command line. It compresses the
 
** file, placing its output in a file named test.lzw. It then expands
 
** test.lzw into test.out. Test.out should then be an exact duplicate of
 
** the input file.
 
**
 
*************************************************************************/
 
 
 
main(int argc, char *argv[])
 
{
 
clock_t timer;
 
CLZWCompressFile lzw;
 
 
 
/*
 
** Get the file name, open it up, and open up the lzw output file.
 
*/
 
if (argc==1)
 
{
 
printf("Input file name to compress?\n");
 
return 0;
 
}
 
 
 
printf("testing %s...\n", argv[1]);
 
/*
 
** Compress the file.
 
*/
 
timer = clock();
 
int crunch = lzw.Compress(argv[1], "test.lzw");
 
timer = clock() - timer; //CLOCKS_PER_SEC
 
printf("compress time=%d ms, encoding=%d, size=%u", timer, lzw.get_bits(), crunch);
 
int filesize = lzw.u_io;
 
printf(" (ratio=%d%%)\n", filesize ? (filesize-crunch)*100/filesize : 0);
 
if(lzw.AnyIOErrors())
 
printf("***I/O ERROR***\n");
 
 
 
/*
 
** Expand the file.
 
*/
 
timer = clock();
 
int orig = lzw.Expand("test.lzw", "test.out");
 
timer = clock() - timer; //CLOCKS_PER_SEC
 
printf("expand time=%d ms, encoding=%d\n", timer, lzw.get_bits());
 
if(lzw.AnyIOErrors())
 
printf("***I/O ERROR***\n");
 
 
 
ATLASSERT(filesize == orig); // did we mangle the file?
 
return 0;
 
}
</pre>
 
=== lzw.h ===
 
<pre>
 
#ifndef UPRIGHT_LZW_H
 
#define UPRIGHT_LZW_H
 
 
 
 
 
/* LZW.h by N.A.Bozinis @ 19/01/2010 08:55:52
 
* ----------------------------------------------------------------------------------
 
*
 
* Plain C++ port of LZW compression algorithm and code originally (c) Mark R. Nelson
 
* http://marknelson.us/1989/10/01/lzw-data-compression
 
* Variable bit length encoding idea and code originally by Michael Dipperstein
 
* http://michael.dipperstein.com/lzw
 
*
 
* There are a lot of compression classes floating around but most are based on the
 
* zlib (zip/unzip) library, which is good but a bit of overkill for simple and small
 
* code. LZW combines decent compression ratios with very small code footprint. If
 
* you need something more powerful here are a few resources:
 
*
 
* http://www.codeproject.com/KB/files/zip_utils.aspx
 
* http://www.codeproject.com/KB/cpp/xzipunzip.aspx
 
* http://www.codeproject.com/KB/cpp/ChauMemzip.aspx
 
*
 
* Microsoft types can check the CAB protocol that is available in all windows:
 
* http://www.codeproject.com/KB/files/CABCompressExtract.aspx
 
* http://msdn.microsoft.com/en-us/library/bb417343.aspx
 
*
 
*/
 
 
 
#include <stdio.h>
 
#include <stdlib.h>
 
#include <limits.h>
 
#include <string.h>
 
#include <assert.h>
 
#define ATLASSERT assert
 
 
 
/* NOTE: function and variable names left as much as possible matching the original
 
LZW.c by Mark, naturally bundled in classes to get rid of static/globals etc
 
*/
 
 
 
#define MIN_CODE_LEN 9 /* min # bits in a code word */
 
#define MAX_CODE_LEN 20 /* max # bits in a code word */
 
#define CURRENT_MAX_CODES(x) (1UL << (x))
 
 
 
#define FIRST_CODE (1 << CHAR_BIT) /* value of 1st string code */
 
 
 
#if (MIN_CODE_LEN <= CHAR_BIT)
 
#error Code words must be larger than 1 character
 
#endif
 
 
 
#if (MAX_CODE_LEN >= 25)
 
#error Code words must fit in an integer
 
#endif
 
 
 
 
 
/* VARIABLE BIT LENGTH ENCODING
 
* Instead of using a fixed number of bits for code words, we start at 9 (=MIN_CODE_LEN)
 
* and go up to BITS (<=MAX_CODE_LEN) so that small files are tightly packed and larger
 
* files are fine too. The BITS constant determines the maximum hash table size. For 18
 
* this means 250KB runtime table size which is enough for files ~4MB.
 
* There is no problem for files larger than that; if we run out of table space for new
 
* codes then the same codes are emitted (uncompressed obviously)
 
*/
 
 
 
#define BITS 17 /* Setting the number of bits to 12, 13*/
 
#define HASHING_SHIFT (BITS-8) /* or 14 affects several constants. */
 
#define MAX_VALUE (1 << BITS) - 1 /* Note that MS-DOS machines need to */
 
#define MAX_CODE MAX_VALUE - 1 /* compile their code in large model if*/
 
/* 14 bits are selected. */
 
 
 
#if BITS == 20
 
#define TABLE_SIZE 1048583
 
#elif BITS == 19
 
#define TABLE_SIZE 524309
 
#elif BITS == 18
 
#define TABLE_SIZE 262147
 
#elif BITS == 17
 
#define TABLE_SIZE 131101
 
#elif BITS == 16
 
#define TABLE_SIZE 65543
 
#elif BITS == 15
 
#define TABLE_SIZE 32797
 
#elif BITS == 14
 
#define TABLE_SIZE 18041 /* The string table size needs to be a */
 
/* prime number that is somewhat larger*/
 
#elif BITS == 13 /* than 2**BITS. */
 
#define TABLE_SIZE 9029
 
#elif BITS == 12
 
#define TABLE_SIZE 5021
 
#else
 
#error define smaller or bigger table sizes
 
#endif
 
 
 
#if (TABLE_SIZE <= MAX_VALUE)
 
#error your prime numbers need attention
 
#endif
 
 
 
#if (BITS > MAX_CODE_LEN)
 
#error BITS can only go up to a maximum
 
#endif
 
 
 
 
 
/*
 
This class does most of the job, except reading source and writing the compressed data
 
A derived class does that so that there's flexibility to read either from files or memory
 
*/
 
 
 
class CLZWImpl {
 
protected:
 
int *code_value; /* This is the code value array */
 
unsigned int *prefix_code; /* This array holds the prefix codes */
 
unsigned char *append_character; /* This array holds the appended chars */
 
unsigned char decode_stack[4000]; /* This array holds the decoded string */
 
unsigned char CUR_BITS; /* ~nab: added for variable bit size */
 
/* we are processing bits but in the end of the day we do I/O in bytes */
 
int input_bit_count, output_bit_count;
 
unsigned long input_bit_buffer, output_bit_buffer;
 
 
 
public:
 
CLZWImpl() {
 
code_value = 0;
 
prefix_code = 0;
 
append_character = 0;
 
}
 
 
 
~CLZWImpl() {
 
if(code_value)
 
free(code_value);
 
if(prefix_code)
 
free(prefix_code);
 
if(append_character)
 
free(append_character);
 
}
 
 
 
int get_bits() { return CUR_BITS; }
 
 
 
protected:
 
int Init() {
 
ATLASSERT(!code_value); /* call just once */
 
 
 
code_value=(int*)malloc(TABLE_SIZE*sizeof(int));
 
prefix_code=(unsigned int*)malloc(TABLE_SIZE*sizeof(unsigned int));
 
append_character=(unsigned char*)malloc(TABLE_SIZE*sizeof(unsigned char));
 
 
 
return code_value != 0 && prefix_code != 0 && append_character != 0;
 
}
 
 
 
/* override these 4: read a byte from source */
 
virtual int getc_src() = 0;
 
/* read a byte from compressed source (during expansion) and write to compressed output */
 
virtual int getc_comp() = 0;
 
/* write a byte to compressed output */
 
virtual int putc_comp(int ch) = 0;
 
/* write a byte to expanded output */
 
virtual int putc_out(int ch) = 0;
 
 
 
/*
 
** This is the compression routine. The code should be a fairly close
 
** match to the algorithm accompanying the article.
 
**
 
*/
 
 
 
void compress()
 
{
 
unsigned int next_code;
 
unsigned int character;
 
unsigned int string_code;
 
unsigned int index;
 
unsigned int bit_limit;
 
int i;
 
 
 
ATLASSERT(code_value); /* initialized? */
 
 
 
CUR_BITS = MIN_CODE_LEN;
 
bit_limit = CURRENT_MAX_CODES(CUR_BITS) - 1;
 
output_bit_count=0;
 
output_bit_buffer=0L;
 
 
 
ATLASSERT(256==FIRST_CODE);
 
next_code=FIRST_CODE; /* Next code is the next available string code*/
 
for (i=0;i<TABLE_SIZE;i++) /* Clear out the string table before starting */
 
code_value[i]=-1;
 
 
 
string_code=getc_src(); /* Get the first code */
 
if(-1 == string_code)
 
return; /* empty file or error */
 
 
 
/*
 
** This is the main loop where it all happens. This loop runs util all of
 
** the input has been exhausted. Note that it stops adding codes to the
 
** table after all of the possible codes have been defined.
 
*/
 
while ((character=getc_src()) != -1)
 
{
 
index=find_match(string_code,character);/* See if the string is in */
 
if (code_value[index] != -1) /* the table. If it is, */
 
string_code=code_value[index]; /* get the code value. If */
 
else /* the string is not in the*/
 
{ /* table, try to add it. */
 
if (next_code <= MAX_CODE)
 
{
 
code_value[index]=next_code++;
 
prefix_code[index]=string_code;
 
append_character[index]=character;
 
}
 
 
 
/* are we using enough bits to write out this code word? */
 
if(string_code >= bit_limit && CUR_BITS < BITS)
 
{
 
/* mark need for bigger code word with all ones */
 
output_code(bit_limit);
 
CUR_BITS++;
 
bit_limit = (CURRENT_MAX_CODES(CUR_BITS) - 1);
 
}
 
 
 
ATLASSERT(string_code < bit_limit);
 
 
 
output_code(string_code); /* When a string is found */
 
string_code=character; /* that is not in the table*/
 
} /* I output the last string*/
 
} /* after adding the new one*/
 
/*
 
** End of the main loop.
 
*/
 
 
 
output_code(string_code); /* Output the last code */
 
output_code(-1); /* This code flushes the output buffer*/
 
}
 
 
 
/*
 
** This is the hashing routine. It tries to find a match for the prefix+char
 
** string in the string table. If it finds it, the index is returned. If
 
** the string is not found, the first available index in the string table is
 
** returned instead.
 
*/
 
int find_match(unsigned int hash_prefix,unsigned int hash_character)
 
{
 
int index;
 
int offset;
 
 
 
index = (hash_character << HASHING_SHIFT) ^ hash_prefix;
 
if (index == 0)
 
offset = 1;
 
else
 
offset = TABLE_SIZE - index;
 
while (1)
 
{
 
if (code_value[index] == -1)
 
return(index);
 
if (prefix_code[index] == hash_prefix &&
 
append_character[index] == hash_character)
 
return(index);
 
index -= offset;
 
if (index < 0)
 
index += TABLE_SIZE;
 
}
 
}
 
 
 
/*
 
** This is the expansion routine. It takes an LZW format file, and expands
 
** it to an output file. The code here should be a fairly close match to
 
** the algorithm in the accompanying article.
 
*/
 
 
 
void expand()
 
{
 
unsigned int next_code;
 
unsigned int new_code;
 
unsigned int old_code;
 
int character;
 
unsigned char *string;
 
unsigned int bit_limit;
 
 
 
ATLASSERT(code_value); /* initialized? */
 
 
 
CUR_BITS = MIN_CODE_LEN;
 
bit_limit = CURRENT_MAX_CODES(CUR_BITS) - 1;
 
input_bit_count=0;
 
input_bit_buffer=0L;
 
 
 
// @@@ what if we pass uncompressed file to decode?
 
 
 
next_code=FIRST_CODE; /* This is the next available code to define */
 
 
 
old_code=input_code(); /* Read in the first code, initialize the */
 
if(-1 == old_code)
 
return; /* read error? */
 
character=old_code; /* character variable, and send the first */
 
if(putc_out(old_code)==-1) /* code to the output file */
 
return; /* write error */
 
/*
 
** This is the main expansion loop. It reads in characters from the LZW file
 
** until it sees the special code used to inidicate the end of the data.
 
*/
 
while ((new_code=input_code()) != (-1))
 
{
 
/* look for code length increase marker */
 
if(bit_limit == new_code && CUR_BITS < BITS)
 
{
 
CUR_BITS++;
 
bit_limit = CURRENT_MAX_CODES(CUR_BITS) - 1;
 
 
 
new_code=input_code();
 
ATLASSERT(new_code != -1); /* must be read error? */
 
if(new_code == -1)
 
break;
 
}
 
 
 
ATLASSERT(new_code < bit_limit);
 
 
 
/*
 
** This code checks for the special STRING+CHARACTER+STRING+CHARACTER+STRING
 
** case which generates an undefined code. It handles it by decoding
 
** the last code, and adding a single character to the end of the decode string.
 
*/
 
if (new_code>=next_code)
 
{
 
*decode_stack=character;
 
string=decode_string(decode_stack+1,old_code);
 
}
 
/*
 
** Otherwise we do a straight decode of the new code.
 
*/
 
else
 
string=decode_string(decode_stack,new_code);
 
/*
 
** Now we output the decoded string in reverse order.
 
*/
 
character=*string;
 
while (string >= decode_stack)
 
putc_out(*string--);
 
/*
 
** Finally, if possible, add a new code to the string table.
 
*/
 
if (next_code <= MAX_CODE)
 
{
 
prefix_code[next_code]=old_code;
 
append_character[next_code]=character;
 
next_code++;
 
}
 
old_code=new_code;
 
}
 
}
 
 
 
/*
 
** This routine simply decodes a string from the string table, storing
 
** it in a buffer. The buffer can then be output in reverse order by
 
** the expansion program.
 
*/
 
/* ~nab: these char* aren't a risk for unicode; we are reading bytes */
 
unsigned char *decode_string(unsigned char *buffer,unsigned int code)
 
{
 
int i;
 
 
 
i=0;
 
while (code >= FIRST_CODE)
 
{
 
*buffer++ = append_character[code];
 
code=prefix_code[code];
 
i++;
 
ATLASSERT(i < sizeof(decode_stack)); /* buffer overrun if it blows, increase stack size! */
 
}
 
*buffer=code;
 
return(buffer);
 
}
 
 
 
/*
 
** The following two routines are used to output variable length
 
** codes. They are written strictly for clarity, and are not
 
** particularyl efficient.
 
 
 
~nab: there's room for improvement in these I/O functions eg work in DWORDS instead of bytes
 
*/
 
 
 
unsigned int input_code()
 
{
 
int c;
 
unsigned int return_value;
 
//static int input_bit_count=0;
 
//static unsigned long input_bit_buffer=0L;
 
 
 
while (input_bit_count <= 24)
 
{
 
if ((c = getc_comp()) == -1)
 
break;
 
 
 
input_bit_buffer |=
 
(unsigned long) c << (24-input_bit_count);
 
input_bit_count += 8;
 
}
 
 
 
if(input_bit_count < CUR_BITS) {
 
ATLASSERT(!input_bit_buffer);
 
return -1; /* EOF */
 
}
 
 
 
return_value=input_bit_buffer >> (32-CUR_BITS);
 
input_bit_buffer <<= CUR_BITS;
 
input_bit_count -= CUR_BITS;
 
 
 
ATLASSERT(return_value < (1UL << CUR_BITS));
 
return(return_value);
 
}
 
 
 
/* bits are written outside normal byte boundaries, hence the need for keeping old values */
 
void output_code(unsigned int code)
 
{
 
//static int output_bit_count=0;
 
//static unsigned long output_bit_buffer=0L;
 
 
 
ATLASSERT(output_bit_count < 8); /* leftovers */
 
ATLASSERT(CUR_BITS + output_bit_count <= 32);
 
/*codes <256 are possible for single characters, zero bytes etc*/
 
 
 
if(-1 == code) {
 
/* pad remaining zeros and flush the last byte */
 
if(output_bit_count) {
 
output_bit_buffer >>= 24;
 
ATLASSERT((output_bit_buffer & 0xFF) == output_bit_buffer);
 
putc_comp(output_bit_buffer);
 
 
 
output_bit_count = 0;
 
output_bit_buffer = 0; /* in case some eejit calls us again */
 
}
 
 
 
return;
 
}
 
 
 
ATLASSERT(code < (1UL << CUR_BITS));
 
 
 
/* sends new bytes near the top (MSB) */
 
output_bit_buffer |= (unsigned long) code << (32-CUR_BITS-output_bit_count);
 
output_bit_count += CUR_BITS;
 
while (output_bit_count >= 8)
 
{
 
/* no check for error but if there was a problem we'd know from the time we wrote the identifier */
 
putc_comp(output_bit_buffer >> 24);
 
output_bit_buffer <<= 8;
 
output_bit_count -= 8;
 
}
 
}
 
}; /* CLZWImpl */
 
 
 
/* example derived class using C buffered I/O functions */
 
class CLZWCompressFile : public CLZWImpl {
 
public:
 
CLZWCompressFile() {
 
io_file = 0;
 
lzw_file = 0;
 
};
 
 
 
~CLZWCompressFile() {
 
ATLASSERT(!io_file);
 
ATLASSERT(!lzw_file);
 
};
 
 
 
int AnyIOErrors() {return io_error; }
 
 
 
// @@@ these char* should be changed for unicode builds
 
unsigned int Compress(char* input_file_name, char* to_name)
 
{
 
ATLASSERT(input_file_name && *input_file_name);
 
ATLASSERT(to_name && *to_name);
 
ATLASSERT(strcmp(to_name, input_file_name));
 
 
 
io_error = 1;
 
 
 
if(!code_value)
 
if(!Init())
 
return 0; /* rare memory error */
 
 
 
u_comp = 0;
 
u_io = 0;
 
io_file=fopen(input_file_name,"rb");
 
if(io_file) {
 
lzw_file=fopen(to_name,"wb");
 
if(lzw_file) {
 
/* write LZW identifier L+starting bytes */
 
putc('L', lzw_file);
 
if(putc(MIN_CODE_LEN, lzw_file) == MIN_CODE_LEN) {
 
compress();
 
io_error = ferror(lzw_file) || ferror(io_file);
 
if(!io_error)
 
ATLASSERT(u_comp <= u_io); /* this is bound to bomb every now and then, no compression! */
 
}
 
fclose(lzw_file);
 
lzw_file = 0;
 
}
 
 
 
fclose(io_file);
 
io_file = 0;
 
}
 
 
 
return u_comp;
 
}
 
 
 
unsigned int Expand(char* lzw_name, char* to_name)
 
{
 
ATLASSERT(lzw_name && *lzw_name);
 
ATLASSERT(to_name && *to_name);
 
ATLASSERT(strcmp(to_name, lzw_name));
 
 
 
io_error = 1;
 
 
 
if(!code_value)
 
if(!Init())
 
return 0; /* rare memory error */
 
 
 
u_comp = 0;
 
u_io = 0;
 
lzw_file=fopen(lzw_name,"rb");
 
if(lzw_file) {
 
/* check LZW identifier L+starting bytes */
 
int ch1 = getc(lzw_file);
 
int ch2 = getc(lzw_file);
 
if('L' == ch1 && MIN_CODE_LEN==ch2) {
 
io_file=fopen(to_name,"wb");
 
if(io_file) {
 
expand();
 
io_error = ferror(lzw_file) || ferror(io_file);
 
 
 
fclose(io_file);
 
io_file = 0;
 
}
 
}
 
 
 
fclose(lzw_file);
 
lzw_file = 0;
 
}
 
 
 
return u_io;
 
}
 
 
 
protected:
 
/* -1 return indicates either EOF or some IO error */
 
virtual int getc_src() {
 
ATLASSERT(io_file);
 
int ch = getc(io_file);
 
if(EOF == ch)
 
return -1;
 
 
 
u_io++;
 
return ch;
 
}
 
virtual int getc_comp() {
 
ATLASSERT(lzw_file);
 
int ch = getc(lzw_file);
 
if(EOF == ch)
 
return -1;
 
 
 
u_comp++;
 
return ch;
 
}
 
virtual int putc_comp(int ch) {
 
ATLASSERT(lzw_file);
 
ATLASSERT(ch >= 0 && ch < 256);
 
int ret = putc(ch, lzw_file);
 
 
 
if(ret != EOF) {
 
ATLASSERT(ret == ch);
 
u_comp++;
 
}
 
else
 
ret = -1;
 
 
 
return ret;
 
}
 
virtual int putc_out(int ch) {
 
ATLASSERT(io_file);
 
ATLASSERT(ch >= 0 && ch < 256);
 
int ret = putc(ch, io_file);
 
 
 
if(ret != EOF)
 
u_io++;
 
else
 
ret = -1;
 
 
 
return ret;
 
}
 
 
 
FILE* io_file;
 
FILE *lzw_file;
 
int io_error;
 
public:
 
unsigned long u_io, u_comp; /* bytes read and written */
 
};
 
 
 
// @@@ could have a generic one on IStream, CreateStreamOnHGlobal/SHCreateStreamOnFile
 
 
 
#endif /* UPRIGHT_LZW_H */
</pre>
----

Navigation menu