Difference between revisions of "Assorted Algorithm Alliteration"
Mdafidchao (talk | contribs) (Created page with '{{GPU610/DPS915 Index | 20123}} = Assorted Algorithm Alliteration = == Team Members == # [mailto:mddelacruz1@myseneca.ca?subject=GPU610 Mark de la Cruz] # [mailto:elim2@mysene…') |
|||
(3 intermediate revisions by 2 users not shown) | |||
Line 5: | Line 5: | ||
== Team Members == | == Team Members == | ||
# [mailto:mddelacruz1@myseneca.ca?subject=GPU610 Mark de la Cruz] | # [mailto:mddelacruz1@myseneca.ca?subject=GPU610 Mark de la Cruz] | ||
− | # [mailto:elim2@myseneca.ca?subject=GPU610 Edwin Lim] | + | # [mailto:elim2@myseneca.ca?subject=GPU610 Edwin Lim] - MD5 Checksum Calculator |
− | # [mailto:mdafidchao@myseneca.ca?subject=GPU610 Michael Afidchao] | + | # [mailto:mdafidchao@myseneca.ca?subject=GPU610 Michael Afidchao] - Game of Life |
# [mailto:mddelacruz1@myseneca.ca;elim2@myseneca.ca;mdafidchao@myseneca.ca?subject=GPU610 eMail All] | # [mailto:mddelacruz1@myseneca.ca;elim2@myseneca.ca;mdafidchao@myseneca.ca?subject=GPU610 eMail All] | ||
− | + | ===Game of Life - Michael Afidchao=== | |
− | === Game of Life === | ||
The Game of Life is a "0 player game" cellular automaton. With an initial configuration, | The Game of Life is a "0 player game" cellular automaton. With an initial configuration, | ||
the game uses a set of rules to determine what happens to the life forms from generation | the game uses a set of rules to determine what happens to the life forms from generation | ||
Line 78: | Line 77: | ||
</pre> | </pre> | ||
+ | ===MD5 Checksum Calculator - Edwin Lim=== | ||
+ | The md5 checksum is a commonly used hash function that produces a 128-bit hash value commonly used to check data integrity. It is also used in a wide variety of security applications, however its use in checking data integrity is what will be explored in this assignment. | ||
+ | |||
+ | The full source code for the MD5 checksum calculator can be found here: [http://www.codeforge.com/article/91071] | ||
+ | |||
+ | ===Profile=== | ||
+ | The following result (external link) was found after attempting to generate the md5 checksum for a 6.4gb file (Windows Vista ISO): | ||
+ | [http://i.imgur.com/B34r9.png]] | ||
+ | |||
+ | ===Code Snippet=== | ||
+ | From analysis of the above profile, the code to be targetted for optimization is the following: | ||
+ | <pre> | ||
+ | void _MD5_transform | ||
+ | (unsigned int state[4], | ||
+ | unsigned char block[64]) | ||
+ | { | ||
+ | |||
+ | unsigned int lA = state[0], lB = state[1], lC = state[2], lD = state[3]; | ||
+ | unsigned int x[16]; | ||
+ | |||
+ | _MD5_decode (x, block, 64); | ||
+ | |||
+ | // round 1 | ||
+ | FF ( lA, lB, lC, lD, x[ 0], S11, 0xd76aa478); // 1 | ||
+ | FF ( lD, lA, lB, lC, x[ 1], S12, 0xe8c7b756); // 2 | ||
+ | FF ( lC, lD, lA, lB, x[ 2], S13, 0x242070db); // 3 | ||
+ | FF ( lB, lC, lD, lA, x[ 3], S14, 0xc1bdceee); // 4 | ||
+ | FF ( lA, lB, lC, lD, x[ 4], S11, 0xf57c0faf); // 5 | ||
+ | FF ( lD, lA, lB, lC, x[ 5], S12, 0x4787c62a); // 6 | ||
+ | FF ( lC, lD, lA, lB, x[ 6], S13, 0xa8304613); // 7 | ||
+ | FF ( lB, lC, lD, lA, x[ 7], S14, 0xfd469501); // 8 | ||
+ | FF ( lA, lB, lC, lD, x[ 8], S11, 0x698098d8); // 9 | ||
+ | FF ( lD, lA, lB, lC, x[ 9], S12, 0x8b44f7af); // 10 | ||
+ | FF ( lC, lD, lA, lB, x[10], S13, 0xffff5bb1); // 11 | ||
+ | FF ( lB, lC, lD, lA, x[11], S14, 0x895cd7be); // 12 | ||
+ | FF ( lA, lB, lC, lD, x[12], S11, 0x6b901122); // 13 | ||
+ | FF ( lD, lA, lB, lC, x[13], S12, 0xfd987193); // 14 | ||
+ | FF ( lC, lD, lA, lB, x[14], S13, 0xa679438e); // 15 | ||
+ | FF ( lB, lC, lD, lA, x[15], S14, 0x49b40821); // 16 | ||
+ | |||
+ | // round 2 | ||
+ | GG ( lA, lB, lC, lD, x[ 1], S21, 0xf61e2562); // 17 | ||
+ | GG ( lD, lA, lB, lC, x[ 6], S22, 0xc040b340); // 18 | ||
+ | GG ( lC, lD, lA, lB, x[11], S23, 0x265e5a51); // 19 | ||
+ | GG ( lB, lC, lD, lA, x[ 0], S24, 0xe9b6c7aa); // 20 | ||
+ | GG ( lA, lB, lC, lD, x[ 5], S21, 0xd62f105d); // 21 | ||
+ | GG ( lD, lA, lB, lC, x[10], S22, 0x2441453); // 22 | ||
+ | GG ( lC, lD, lA, lB, x[15], S23, 0xd8a1e681); // 23 | ||
+ | GG ( lB, lC, lD, lA, x[ 4], S24, 0xe7d3fbc8); // 24 | ||
+ | GG ( lA, lB, lC, lD, x[ 9], S21, 0x21e1cde6); // 25 | ||
+ | GG ( lD, lA, lB, lC, x[14], S22, 0xc33707d6); // 26 | ||
+ | GG ( lC, lD, lA, lB, x[ 3], S23, 0xf4d50d87); // 27 | ||
+ | GG ( lB, lC, lD, lA, x[ 8], S24, 0x455a14ed); // 28 | ||
+ | GG ( lA, lB, lC, lD, x[13], S21, 0xa9e3e905); // 29 | ||
+ | GG ( lD, lA, lB, lC, x[ 2], S22, 0xfcefa3f8); // 30 | ||
+ | GG ( lC, lD, lA, lB, x[ 7], S23, 0x676f02d9); // 31 | ||
+ | GG ( lB, lC, lD, lA, x[12], S24, 0x8d2a4c8a); // 32 | ||
+ | |||
+ | // round 3 | ||
+ | HH ( lA, lB, lC, lD, x[ 5], S31, 0xfffa3942); // 33 | ||
+ | HH ( lD, lA, lB, lC, x[ 8], S32, 0x8771f681); // 34 | ||
+ | HH ( lC, lD, lA, lB, x[11], S33, 0x6d9d6122); // 35 | ||
+ | HH ( lB, lC, lD, lA, x[14], S34, 0xfde5380c); // 36 | ||
+ | HH ( lA, lB, lC, lD, x[ 1], S31, 0xa4beea44); // 37 | ||
+ | HH ( lD, lA, lB, lC, x[ 4], S32, 0x4bdecfa9); // 38 | ||
+ | HH ( lC, lD, lA, lB, x[ 7], S33, 0xf6bb4b60); // 39 | ||
+ | HH ( lB, lC, lD, lA, x[10], S34, 0xbebfbc70); // 40 | ||
+ | HH ( lA, lB, lC, lD, x[13], S31, 0x289b7ec6); // 41 | ||
+ | HH ( lD, lA, lB, lC, x[ 0], S32, 0xeaa127fa); // 42 | ||
+ | HH ( lC, lD, lA, lB, x[ 3], S33, 0xd4ef3085); // 43 | ||
+ | HH ( lB, lC, lD, lA, x[ 6], S34, 0x4881d05); // 44 | ||
+ | HH ( lA, lB, lC, lD, x[ 9], S31, 0xd9d4d039); // 45 | ||
+ | HH ( lD, lA, lB, lC, x[12], S32, 0xe6db99e5); // 46 | ||
+ | HH ( lC, lD, lA, lB, x[15], S33, 0x1fa27cf8); // 47 | ||
+ | HH ( lB, lC, lD, lA, x[ 2], S34, 0xc4ac5665); // 48 | ||
+ | |||
+ | // round 4 | ||
+ | II ( lA, lB, lC, lD, x[ 0], S41, 0xf4292244); // 49 | ||
+ | II ( lD, lA, lB, lC, x[ 7], S42, 0x432aff97); // 50 | ||
+ | II ( lC, lD, lA, lB, x[14], S43, 0xab9423a7); // 51 | ||
+ | II ( lB, lC, lD, lA, x[ 5], S44, 0xfc93a039); // 52 | ||
+ | II ( lA, lB, lC, lD, x[12], S41, 0x655b59c3); // 53 | ||
+ | II ( lD, lA, lB, lC, x[ 3], S42, 0x8f0ccc92); // 54 | ||
+ | II ( lC, lD, lA, lB, x[10], S43, 0xffeff47d); // 55 | ||
+ | II ( lB, lC, lD, lA, x[ 1], S44, 0x85845dd1); // 56 | ||
+ | II ( lA, lB, lC, lD, x[ 8], S41, 0x6fa87e4f); // 57 | ||
+ | II ( lD, lA, lB, lC, x[15], S42, 0xfe2ce6e0); // 58 | ||
+ | II ( lC, lD, lA, lB, x[ 6], S43, 0xa3014314); // 59 | ||
+ | II ( lB, lC, lD, lA, x[13], S44, 0x4e0811a1); // 60 | ||
+ | II ( lA, lB, lC, lD, x[ 4], S41, 0xf7537e82); // 61 | ||
+ | II ( lD, lA, lB, lC, x[11], S42, 0xbd3af235); // 62 | ||
+ | II ( lC, lD, lA, lB, x[ 2], S43, 0x2ad7d2bb); // 63 | ||
+ | II ( lB, lC, lD, lA, x[ 9], S44, 0xeb86d391); // 64 | ||
+ | |||
+ | state[0] += lA; | ||
+ | state[1] += lB; | ||
+ | state[2] += lC; | ||
+ | state[3] += lD; | ||
+ | |||
+ | // lClear sensitive information | ||
+ | memset(x, 0, 16); | ||
+ | } | ||
+ | </pre> | ||
+ | |||
+ | === Bzip2 - Mark de la Cruz === | ||
+ | From http://www.bzip.org/: bzip2 is a freely available, patent free (see below), high-quality data compressor. It typically compresses files to within 10% to 15% of the best available techniques (the PPM family of statistical compressors), whilst being around twice as fast at compression and six times faster at decompression. | ||
+ | The full source can be found in the link above. | ||
+ | |||
+ | Below is the profile and the function to be optimized: | ||
+ | <pre> | ||
+ | |||
+ | Flat profile: | ||
+ | |||
+ | |||
+ | |||
+ | |||
+ | Each sample counts as 0.01 seconds. | ||
+ | |||
+ | % cumulative self self total | ||
+ | |||
+ | time seconds seconds calls s/call s/call name | ||
+ | |||
+ | 63.21 7.01 7.01 23 0.30 0.47 BZ2_compressBlock | ||
+ | |||
+ | 28.85 10.21 3.20 23 0.14 0.14 mainSort | ||
+ | |||
+ | 4.78 10.74 0.53 1 0.53 0.53 fallbackSort | ||
+ | |||
+ | 2.34 11.00 0.26 8280 0.00 0.00 default_bzalloc | ||
+ | |||
+ | 0.54 11.06 0.06 23 0.00 0.16 BZ2_blockSort | ||
+ | |||
+ | 0.27 11.09 0.03 552 0.00 0.00 BZ2_hbMakeCodeLengths | ||
+ | |||
+ | 0.00 11.09 0.00 113812 0.00 0.00 add_pair_to_block | ||
+ | |||
+ | 0.00 11.09 0.00 58370 0.00 0.00 _init | ||
+ | |||
+ | 0.00 11.09 0.00 8276 0.00 0.00 BZ2_bzCompress | ||
+ | |||
+ | 0.00 11.09 0.00 4159 0.00 0.00 myfeof | ||
+ | |||
+ | 0.00 11.09 0.00 4158 0.00 0.00 BZ2_bzWrite | ||
+ | |||
+ | 0.00 11.09 0.00 148 0.00 0.00 bsPutUChar | ||
+ | |||
+ | 0.00 11.09 0.00 138 0.00 0.00 BZ2_hbAssignCodes | ||
+ | |||
+ | 0.00 11.09 0.00 24 0.00 0.00 bsPutUInt32 | ||
+ | |||
+ | 0.00 11.09 0.00 4 0.00 0.00 default_bzfree | ||
+ | |||
+ | 0.00 11.09 0.00 4 0.00 0.00 hasSuffix | ||
+ | |||
+ | 0.00 11.09 0.00 4 0.00 0.00 testf | ||
+ | |||
+ | 0.00 11.09 0.00 2 0.00 0.00 copyFileName | ||
+ | |||
+ | 0.00 11.09 0.00 2 0.00 0.00 fileExists | ||
+ | |||
+ | 0.00 11.09 0.00 1 0.00 0.00 BZ2_bzCompressEnd | ||
+ | |||
+ | 0.00 11.09 0.00 1 0.00 0.01 BZ2_bzCompressInit | ||
+ | |||
+ | 0.00 11.09 0.00 1 0.00 0.24 BZ2_bzWriteClose64 | ||
+ | |||
+ | 0.00 11.09 0.00 1 0.00 0.01 BZ2_bzWriteOpen | ||
+ | |||
+ | 0.00 11.09 0.00 1 0.00 0.00 applySavedFileAttrToOutputFile | ||
+ | |||
+ | 0.00 11.09 0.00 1 0.00 11.09 compressStream | ||
+ | |||
+ | |||
+ | |||
+ | |||
+ | |||
+ | |||
+ | |||
+ | |===============================================================================| | ||
+ | |||
+ | Source code to compressBlock | ||
+ | |||
+ | |||
+ | |||
+ | |||
+ | /*---------------------------------------------------*/ | ||
+ | |||
+ | void BZ2_compressBlock ( EState* s, Bool is_last_block ) | ||
+ | |||
+ | { | ||
+ | |||
+ | if (s->nblock > 0) { | ||
+ | |||
+ | |||
+ | |||
+ | |||
+ | BZ_FINALISE_CRC ( s->blockCRC ); | ||
+ | |||
+ | s->combinedCRC = (s->combinedCRC << 1) | (s->combinedCRC >> 31); | ||
+ | |||
+ | s->combinedCRC ^= s->blockCRC; | ||
+ | |||
+ | if (s->blockNo > 1) s->numZ = 0; | ||
+ | |||
+ | |||
+ | |||
+ | |||
+ | if (s->verbosity >= 2) | ||
+ | |||
+ | VPrintf4( " block %d: crc = 0x%08x, " | ||
+ | |||
+ | "combined CRC = 0x%08x, size = %d\n", | ||
+ | |||
+ | s->blockNo, s->blockCRC, s->combinedCRC, s->nblock ); | ||
+ | |||
+ | |||
+ | |||
+ | |||
+ | BZ2_blockSort ( s ); | ||
+ | |||
+ | } | ||
+ | |||
+ | |||
+ | |||
+ | |||
+ | s->zbits = (UChar*) (&((UChar*)s->arr2)[s->nblock]); | ||
+ | |||
+ | |||
+ | |||
+ | |||
+ | /*-- If this is the first block, create the stream header. --*/ | ||
+ | |||
+ | if (s->blockNo == 1) { | ||
+ | |||
+ | BZ2_bsInitWrite ( s ); | ||
+ | |||
+ | bsPutUChar ( s, BZ_HDR_B ); | ||
+ | |||
+ | bsPutUChar ( s, BZ_HDR_Z ); | ||
+ | |||
+ | bsPutUChar ( s, BZ_HDR_h ); | ||
+ | |||
+ | bsPutUChar ( s, (UChar)(BZ_HDR_0 + s->blockSize100k) ); | ||
+ | |||
+ | } | ||
+ | |||
+ | |||
+ | |||
+ | |||
+ | if (s->nblock > 0) { | ||
+ | |||
+ | |||
+ | |||
+ | |||
+ | bsPutUChar ( s, 0x31 ); bsPutUChar ( s, 0x41 ); | ||
+ | |||
+ | bsPutUChar ( s, 0x59 ); bsPutUChar ( s, 0x26 ); | ||
+ | |||
+ | bsPutUChar ( s, 0x53 ); bsPutUChar ( s, 0x59 ); | ||
+ | |||
+ | |||
+ | |||
+ | |||
+ | /*-- Now the block's CRC, so it is in a known place. --*/ | ||
+ | |||
+ | bsPutUInt32 ( s, s->blockCRC ); | ||
+ | |||
+ | |||
+ | |||
+ | |||
+ | /*-- | ||
+ | |||
+ | Now a single bit indicating (non-)randomisation. | ||
+ | |||
+ | As of version 0.9.5, we use a better sorting algorithm | ||
+ | |||
+ | which makes randomisation unnecessary. So always set | ||
+ | |||
+ | the randomised bit to 'no'. Of course, the decoder | ||
+ | |||
+ | still needs to be able to handle randomised blocks | ||
+ | |||
+ | so as to maintain backwards compatibility with | ||
+ | |||
+ | older versions of bzip2. | ||
+ | |||
+ | --*/ | ||
+ | |||
+ | bsW(s,1,0); | ||
+ | |||
+ | |||
+ | |||
+ | |||
+ | bsW ( s, 24, s->origPtr ); | ||
+ | |||
+ | generateMTFValues ( s ); | ||
+ | |||
+ | sendMTFValues ( s ); | ||
+ | |||
+ | } | ||
+ | |||
+ | |||
+ | |||
+ | |||
+ | |||
+ | |||
+ | |||
+ | /*-- If this is the last block, add the stream trailer. --*/ | ||
+ | |||
+ | if (is_last_block) { | ||
+ | |||
+ | |||
+ | |||
+ | |||
+ | bsPutUChar ( s, 0x17 ); bsPutUChar ( s, 0x72 ); | ||
+ | |||
+ | bsPutUChar ( s, 0x45 ); bsPutUChar ( s, 0x38 ); | ||
+ | |||
+ | bsPutUChar ( s, 0x50 ); bsPutUChar ( s, 0x90 ); | ||
+ | |||
+ | bsPutUInt32 ( s, s->combinedCRC ); | ||
+ | |||
+ | if (s->verbosity >= 2) | ||
+ | |||
+ | VPrintf1( " final combined CRC = 0x%08x\n ", s->combinedCRC ); | ||
+ | |||
+ | bsFinishWrite ( s ); | ||
+ | |||
+ | } | ||
+ | |||
+ | } | ||
+ | |||
+ | </pre> | ||
== Instructor's Comments == | == Instructor's Comments == |
Latest revision as of 09:34, 11 October 2012
GPU610/DPS915 | Student List | Group and Project Index | Student Resources | Glossary
Contents
Assorted Algorithm Alliteration
Team Members
- Mark de la Cruz
- Edwin Lim - MD5 Checksum Calculator
- Michael Afidchao - Game of Life
- eMail All
Game of Life - Michael Afidchao
The Game of Life is a "0 player game" cellular automaton. With an initial configuration, the game uses a set of rules to determine what happens to the life forms from generation to generation. More information can be found at http://en.wikipedia.org/wiki/Conway%27s_Game_of_Life
Source code can be found here:
Profile
These results were taken with an execution of 50000 generations and default settings for the rest of the options.
Each sample counts as 0.01 seconds. % cumulative self self total time seconds seconds calls Ts/call Ts/call name 55.82 14.76 14.76 eval_rules 41.75 25.80 11.04 do_draw 2.23 26.39 0.59 update_grid 0.19 26.44 0.05 copy_bounds 0.00 26.44 0.00 11025 0.00 0.00 rand_double 0.00 26.44 0.00 1 0.00 0.00 allocate_grids 0.00 26.44 0.00 1 0.00 0.00 free_grids 0.00 26.44 0.00 1 0.00 0.00 init_grids 0.00 26.44 0.00 1 0.00 0.00 moveWindow 0.00 26.44 0.00 1 0.00 0.00 parse_args 0.00 26.44 0.00 1 0.00 0.00 randomize_grid 0.00 26.44 0.00 1 0.00 0.00 setupWindow 0.00 26.44 0.00 1 0.00 0.00 write_grid
Code Snippet
/* eval_rules() Evaluate the rules of Life for each cell; count neighbors and update current state accordingly. */ void eval_rules (struct life_t * life) { int i,j,k,l,neighbors; int ncols = life->ncols; int nrows = life->nrows; int ** grid = life->grid; int ** next_grid = life->next_grid; for (i = 1; i <= ncols; i++) { for (j = 1; j <= nrows; j++) { neighbors = 0; // count neighbors for (k = i-1; k <= i+1; k++) { for (l = j-1; l <= j+1; l++) { if (!(k == i && l == j) && grid[k][l] != DEAD) neighbors++; } } // update state if (neighbors < LOWER_THRESH || neighbors > UPPER_THRESH) next_grid[i][j] = DEAD; else if (grid[i][j] != DEAD || neighbors == SPAWN_THRESH) next_grid[i][j] = grid[i][j]+1; } } }
MD5 Checksum Calculator - Edwin Lim
The md5 checksum is a commonly used hash function that produces a 128-bit hash value commonly used to check data integrity. It is also used in a wide variety of security applications, however its use in checking data integrity is what will be explored in this assignment.
The full source code for the MD5 checksum calculator can be found here: [1]
Profile
The following result (external link) was found after attempting to generate the md5 checksum for a 6.4gb file (Windows Vista ISO): [2]]
Code Snippet
From analysis of the above profile, the code to be targetted for optimization is the following:
void _MD5_transform (unsigned int state[4], unsigned char block[64]) { unsigned int lA = state[0], lB = state[1], lC = state[2], lD = state[3]; unsigned int x[16]; _MD5_decode (x, block, 64); // round 1 FF ( lA, lB, lC, lD, x[ 0], S11, 0xd76aa478); // 1 FF ( lD, lA, lB, lC, x[ 1], S12, 0xe8c7b756); // 2 FF ( lC, lD, lA, lB, x[ 2], S13, 0x242070db); // 3 FF ( lB, lC, lD, lA, x[ 3], S14, 0xc1bdceee); // 4 FF ( lA, lB, lC, lD, x[ 4], S11, 0xf57c0faf); // 5 FF ( lD, lA, lB, lC, x[ 5], S12, 0x4787c62a); // 6 FF ( lC, lD, lA, lB, x[ 6], S13, 0xa8304613); // 7 FF ( lB, lC, lD, lA, x[ 7], S14, 0xfd469501); // 8 FF ( lA, lB, lC, lD, x[ 8], S11, 0x698098d8); // 9 FF ( lD, lA, lB, lC, x[ 9], S12, 0x8b44f7af); // 10 FF ( lC, lD, lA, lB, x[10], S13, 0xffff5bb1); // 11 FF ( lB, lC, lD, lA, x[11], S14, 0x895cd7be); // 12 FF ( lA, lB, lC, lD, x[12], S11, 0x6b901122); // 13 FF ( lD, lA, lB, lC, x[13], S12, 0xfd987193); // 14 FF ( lC, lD, lA, lB, x[14], S13, 0xa679438e); // 15 FF ( lB, lC, lD, lA, x[15], S14, 0x49b40821); // 16 // round 2 GG ( lA, lB, lC, lD, x[ 1], S21, 0xf61e2562); // 17 GG ( lD, lA, lB, lC, x[ 6], S22, 0xc040b340); // 18 GG ( lC, lD, lA, lB, x[11], S23, 0x265e5a51); // 19 GG ( lB, lC, lD, lA, x[ 0], S24, 0xe9b6c7aa); // 20 GG ( lA, lB, lC, lD, x[ 5], S21, 0xd62f105d); // 21 GG ( lD, lA, lB, lC, x[10], S22, 0x2441453); // 22 GG ( lC, lD, lA, lB, x[15], S23, 0xd8a1e681); // 23 GG ( lB, lC, lD, lA, x[ 4], S24, 0xe7d3fbc8); // 24 GG ( lA, lB, lC, lD, x[ 9], S21, 0x21e1cde6); // 25 GG ( lD, lA, lB, lC, x[14], S22, 0xc33707d6); // 26 GG ( lC, lD, lA, lB, x[ 3], S23, 0xf4d50d87); // 27 GG ( lB, lC, lD, lA, x[ 8], S24, 0x455a14ed); // 28 GG ( lA, lB, lC, lD, x[13], S21, 0xa9e3e905); // 29 GG ( lD, lA, lB, lC, x[ 2], S22, 0xfcefa3f8); // 30 GG ( lC, lD, lA, lB, x[ 7], S23, 0x676f02d9); // 31 GG ( lB, lC, lD, lA, x[12], S24, 0x8d2a4c8a); // 32 // round 3 HH ( lA, lB, lC, lD, x[ 5], S31, 0xfffa3942); // 33 HH ( lD, lA, lB, lC, x[ 8], S32, 0x8771f681); // 34 HH ( lC, lD, lA, lB, x[11], S33, 0x6d9d6122); // 35 HH ( lB, lC, lD, lA, x[14], S34, 0xfde5380c); // 36 HH ( lA, lB, lC, lD, x[ 1], S31, 0xa4beea44); // 37 HH ( lD, lA, lB, lC, x[ 4], S32, 0x4bdecfa9); // 38 HH ( lC, lD, lA, lB, x[ 7], S33, 0xf6bb4b60); // 39 HH ( lB, lC, lD, lA, x[10], S34, 0xbebfbc70); // 40 HH ( lA, lB, lC, lD, x[13], S31, 0x289b7ec6); // 41 HH ( lD, lA, lB, lC, x[ 0], S32, 0xeaa127fa); // 42 HH ( lC, lD, lA, lB, x[ 3], S33, 0xd4ef3085); // 43 HH ( lB, lC, lD, lA, x[ 6], S34, 0x4881d05); // 44 HH ( lA, lB, lC, lD, x[ 9], S31, 0xd9d4d039); // 45 HH ( lD, lA, lB, lC, x[12], S32, 0xe6db99e5); // 46 HH ( lC, lD, lA, lB, x[15], S33, 0x1fa27cf8); // 47 HH ( lB, lC, lD, lA, x[ 2], S34, 0xc4ac5665); // 48 // round 4 II ( lA, lB, lC, lD, x[ 0], S41, 0xf4292244); // 49 II ( lD, lA, lB, lC, x[ 7], S42, 0x432aff97); // 50 II ( lC, lD, lA, lB, x[14], S43, 0xab9423a7); // 51 II ( lB, lC, lD, lA, x[ 5], S44, 0xfc93a039); // 52 II ( lA, lB, lC, lD, x[12], S41, 0x655b59c3); // 53 II ( lD, lA, lB, lC, x[ 3], S42, 0x8f0ccc92); // 54 II ( lC, lD, lA, lB, x[10], S43, 0xffeff47d); // 55 II ( lB, lC, lD, lA, x[ 1], S44, 0x85845dd1); // 56 II ( lA, lB, lC, lD, x[ 8], S41, 0x6fa87e4f); // 57 II ( lD, lA, lB, lC, x[15], S42, 0xfe2ce6e0); // 58 II ( lC, lD, lA, lB, x[ 6], S43, 0xa3014314); // 59 II ( lB, lC, lD, lA, x[13], S44, 0x4e0811a1); // 60 II ( lA, lB, lC, lD, x[ 4], S41, 0xf7537e82); // 61 II ( lD, lA, lB, lC, x[11], S42, 0xbd3af235); // 62 II ( lC, lD, lA, lB, x[ 2], S43, 0x2ad7d2bb); // 63 II ( lB, lC, lD, lA, x[ 9], S44, 0xeb86d391); // 64 state[0] += lA; state[1] += lB; state[2] += lC; state[3] += lD; // lClear sensitive information memset(x, 0, 16); }
Bzip2 - Mark de la Cruz
From http://www.bzip.org/: bzip2 is a freely available, patent free (see below), high-quality data compressor. It typically compresses files to within 10% to 15% of the best available techniques (the PPM family of statistical compressors), whilst being around twice as fast at compression and six times faster at decompression. The full source can be found in the link above.
Below is the profile and the function to be optimized:
Flat profile: Each sample counts as 0.01 seconds. % cumulative self self total time seconds seconds calls s/call s/call name 63.21 7.01 7.01 23 0.30 0.47 BZ2_compressBlock 28.85 10.21 3.20 23 0.14 0.14 mainSort 4.78 10.74 0.53 1 0.53 0.53 fallbackSort 2.34 11.00 0.26 8280 0.00 0.00 default_bzalloc 0.54 11.06 0.06 23 0.00 0.16 BZ2_blockSort 0.27 11.09 0.03 552 0.00 0.00 BZ2_hbMakeCodeLengths 0.00 11.09 0.00 113812 0.00 0.00 add_pair_to_block 0.00 11.09 0.00 58370 0.00 0.00 _init 0.00 11.09 0.00 8276 0.00 0.00 BZ2_bzCompress 0.00 11.09 0.00 4159 0.00 0.00 myfeof 0.00 11.09 0.00 4158 0.00 0.00 BZ2_bzWrite 0.00 11.09 0.00 148 0.00 0.00 bsPutUChar 0.00 11.09 0.00 138 0.00 0.00 BZ2_hbAssignCodes 0.00 11.09 0.00 24 0.00 0.00 bsPutUInt32 0.00 11.09 0.00 4 0.00 0.00 default_bzfree 0.00 11.09 0.00 4 0.00 0.00 hasSuffix 0.00 11.09 0.00 4 0.00 0.00 testf 0.00 11.09 0.00 2 0.00 0.00 copyFileName 0.00 11.09 0.00 2 0.00 0.00 fileExists 0.00 11.09 0.00 1 0.00 0.00 BZ2_bzCompressEnd 0.00 11.09 0.00 1 0.00 0.01 BZ2_bzCompressInit 0.00 11.09 0.00 1 0.00 0.24 BZ2_bzWriteClose64 0.00 11.09 0.00 1 0.00 0.01 BZ2_bzWriteOpen 0.00 11.09 0.00 1 0.00 0.00 applySavedFileAttrToOutputFile 0.00 11.09 0.00 1 0.00 11.09 compressStream |===============================================================================| Source code to compressBlock /*---------------------------------------------------*/ void BZ2_compressBlock ( EState* s, Bool is_last_block ) { if (s->nblock > 0) { BZ_FINALISE_CRC ( s->blockCRC ); s->combinedCRC = (s->combinedCRC << 1) | (s->combinedCRC >> 31); s->combinedCRC ^= s->blockCRC; if (s->blockNo > 1) s->numZ = 0; if (s->verbosity >= 2) VPrintf4( " block %d: crc = 0x%08x, " "combined CRC = 0x%08x, size = %d\n", s->blockNo, s->blockCRC, s->combinedCRC, s->nblock ); BZ2_blockSort ( s ); } s->zbits = (UChar*) (&((UChar*)s->arr2)[s->nblock]); /*-- If this is the first block, create the stream header. --*/ if (s->blockNo == 1) { BZ2_bsInitWrite ( s ); bsPutUChar ( s, BZ_HDR_B ); bsPutUChar ( s, BZ_HDR_Z ); bsPutUChar ( s, BZ_HDR_h ); bsPutUChar ( s, (UChar)(BZ_HDR_0 + s->blockSize100k) ); } if (s->nblock > 0) { bsPutUChar ( s, 0x31 ); bsPutUChar ( s, 0x41 ); bsPutUChar ( s, 0x59 ); bsPutUChar ( s, 0x26 ); bsPutUChar ( s, 0x53 ); bsPutUChar ( s, 0x59 ); /*-- Now the block's CRC, so it is in a known place. --*/ bsPutUInt32 ( s, s->blockCRC ); /*-- Now a single bit indicating (non-)randomisation. As of version 0.9.5, we use a better sorting algorithm which makes randomisation unnecessary. So always set the randomised bit to 'no'. Of course, the decoder still needs to be able to handle randomised blocks so as to maintain backwards compatibility with older versions of bzip2. --*/ bsW(s,1,0); bsW ( s, 24, s->origPtr ); generateMTFValues ( s ); sendMTFValues ( s ); } /*-- If this is the last block, add the stream trailer. --*/ if (is_last_block) { bsPutUChar ( s, 0x17 ); bsPutUChar ( s, 0x72 ); bsPutUChar ( s, 0x45 ); bsPutUChar ( s, 0x38 ); bsPutUChar ( s, 0x50 ); bsPutUChar ( s, 0x90 ); bsPutUInt32 ( s, s->combinedCRC ); if (s->verbosity >= 2) VPrintf1( " final combined CRC = 0x%08x\n ", s->combinedCRC ); bsFinishWrite ( s ); } }