Changes

← Older edit

DPS915/M-N-M

2,127 bytes removed, 16:55, 12 April 2013

→‎Final version's errors, warnings and observations

</pre>

~~=== Assignment 1 ===~~

==== Muhammad Ahsan: Prime Number Generator( 1,000,000,000 primes) ====

</pre>

~~=== Assignment 1 ===~~

~~==== Nitin Prakash Panicker: LZW File Compression ====~~

~~<pre>~~

~~Flat profile:~~

~~Each sample counts as 0.01 seconds.~~

~~% cumulative self self total~~

~~time seconds seconds calls ns/call ns/call name~~

~~99.46 48.19 48.19 CLZWCompressFile::Compress(char*, char*)~~

~~0.33 48.35 0.16 17122488 9.34 9.34 CLZWCompressFile::getc_src()~~

~~0.21 48.45 0.10 7095561 14.09 14.09 CLZWCompressFile::putc_comp(int)~~

~~</pre>~~

~~=== lzw.cpp ===~~

~~[[Media:lzw.cpp]]~~

<pre>

~~#include <time.h>~~ ~~#include "lzw.h"~~ /**~~******************************************************************~~ ** ** This program gets a file name from the command line{{{ http://code. ~~It compresses the~~ ** file, placing its output in a file named testactivestate.~~lzw. It then expands~~ ** test.lzw into test.out. Test.out should then be an exact duplicate of ** the input file. ** ~~*************************************************************************~~com/recipes/576559/ ~~main~~(~~int argc, char~~ r2) *~~argv[])~~ { ~~clock_t timer;~~ ~~CLZWCompressFile lzw;~~ /

/*

** Get the file name Permission is hereby granted, ~~open it up~~free of charge, to any person obtaining a copy of this software and ~~open up~~ associated documentation files (the ~~lzw output file.~~"Software"), to deal in the Software without restriction, including without limitation the rights* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is ~~if (argc==1)~~ { furnished to do so, subject to the following conditions:

~~printf("Input file name to compress?\n");~~ The above copyright notice and this permission notice shall be included in ~~return 0;~~ } ~~printf("testing %s...\n", argv[1]);~~ /* ** Compress all copies or substantial portions of the ~~file~~Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR

IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,

FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE

AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER

LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,

OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN

THE SOFTWARE.

*/

~~timer = clock();~~ ~~int crunch = lzw.Compress(argv[1], "test.lzw");~~ ~~timer = clock() - timer; //CLOCKS_PER_SEC~~ ~~printf("compress time=%d ms, encoding=%d, size=%u", timer, lzw.get_bits(), crunch);~~ ~~int filesize = lzw.u_io;~~ ~~printf(" (ratio=%d%%)\n", filesize ? (filesize-crunch)*100/filesize : 0);~~ ~~if(lzw.AnyIOErrors())~~ ~~printf("***I/O ERROR***\n");~~ /* ** Expand the file. */ ~~timer = clock();~~ ~~int orig = lzw.Expand("test.lzw", "test.out");~~ ~~timer = clock() - timer; //CLOCKS_PER_SEC~~ ~~printf("expand time=%d ms, encoding=%d\n", timer, lzw.get_bits());~~ ~~if(lzw.AnyIOErrors())~~ ~~printf("***I/O ERROR***\n");~~ ~~ATLASSERT(filesize == orig); // did we mangle the file?~~ ~~return 0;~~ }~~</pre>~~ ~~=== lzw.h ===~~ ~~<pre>~~ ~~#ifndef UPRIGHT_LZW_H~~ ~~#define UPRIGHT_LZW_H~~ ~~/* LZW.h by N.A.Bozinis @ 19/01/2010 08:55:52~~ * ---------------------------------------------------------------------------------- * * Plain C++ port of LZW compression algorithm and code originally (c) Mark R. Nelson * http://marknelson.us/1989/10/01/lzw-data-compression * Variable bit length encoding idea and code originally by Michael Dipperstein * http://michael.dipperstein.com/lzw * * There are a lot of compression classes floating around but most are based on the * zlib (zip/unzip) library, which is good but a bit of overkill for simple and small * code. LZW combines decent compression ratios with very small code footprint. If * you need something more powerful here are a few resources: * * http://www.codeproject.com/KB/files/zip_utils.aspx * http://www.codeproject.com/KB/cpp/xzipunzip.aspx * http://www.codeproject.com/KB/cpp/ChauMemzip.aspx * * Microsoft types can check the CAB protocol that is available in all windows: * http://www.codeproject.com/KB/files/CABCompressExtract.aspx * http://msdn.microsoft.com/en-us/library/bb417343.aspx * */ ~~#include <stdio.h>~~ #include <~~stdlib.h~~iostream> #include <~~limits.h~~vector>

#include <string.h>

~~#include <assert.h>~~ ~~#define ATLASSERT assert~~ ~~/* NOTE: function and variable names left as much as possible matching the original~~ ~~LZW.c by Mark, naturally bundled in classes to get rid of static/globals etc~~ */ ~~#define MIN_CODE_LEN 9 /* min # bits in a code word */~~ ~~#define MAX_CODE_LEN 20 /* max # bits in a code word */~~ ~~#define CURRENT_MAX_CODES(x) (1UL << (x))~~ ~~#define FIRST_CODE (1 << CHAR_BIT) /* value of 1st string code */~~ ~~#if (MIN_CODE_LEN <= CHAR_BIT)~~ ~~#error Code words must be larger than 1 character~~ ~~#endif~~ ~~#if (MAX_CODE_LEN >= 25)~~ ~~#error Code words must fit in an integer~~ ~~#endif~~ ~~/* VARIABLE BIT LENGTH ENCODING~~ * Instead of using ~~a fixed number of bits for code words, we start at 9 (=MIN_CODE_LEN)~~ * and go up to BITS (<=MAX_CODE_LEN) so that small files are tightly packed and larger * files are fine too. The BITS constant determines the maximum hash table size. For 18 * this means 250KB runtime table size which is enough for files ~4MB. * There is no problem for files larger than that; if we run out of table space for new * codes then the same codes are emitted (uncompressed obviously) */ ~~#define BITS 17 /* Setting the number of bits to 12, 13*/~~ ~~#define HASHING_SHIFT (BITS-8) /* or 14 affects several constants. */~~ ~~#define MAX_VALUE (1 << BITS) - 1 /* Note that MS-DOS machines need to */~~ ~~#define MAX_CODE MAX_VALUE - 1 /* compile their code in large model if*/~~ ~~/* 14 bits are selected. */~~ ~~#if BITS == 20~~ ~~#define TABLE_SIZE 1048583~~ ~~#elif BITS == 19~~ ~~#define TABLE_SIZE 524309~~ ~~#elif BITS == 18~~ ~~#define TABLE_SIZE 262147~~ ~~#elif BITS == 17~~ ~~#define TABLE_SIZE 131101~~ ~~#elif BITS == 16~~ ~~#define TABLE_SIZE 65543~~ ~~#elif BITS == 15~~ ~~#define TABLE_SIZE 32797~~ ~~#elif BITS == 14~~ ~~#define TABLE_SIZE 18041 /* The string table size needs to be a */~~ ~~/* prime number that is somewhat larger*/~~ ~~#elif BITS == 13 /* than 2**BITS. 
*/~~ ~~#define TABLE_SIZE 9029~~ ~~#elif BITS == 12~~ ~~#define TABLE_SIZE 5021~~ ~~#else~~ ~~#error define smaller or bigger table sizes~~ ~~#endif~~ ~~#if (TABLE_SIZE <= MAX_VALUE)~~ ~~#error your prime numbers need attention~~ ~~#endif~~ ~~#if (BITS > MAX_CODE_LEN)~~ ~~#error BITS can only go up to a maximum~~ ~~#endif~~ /* ~~This class does most of the job, except reading source and writing the compressed data~~ ~~A derived class does that so that there's flexibility to read either from files or memory~~ */ ~~class CLZWImpl {~~ ~~protected:~~ ~~int *code_value; /* This is the code value array */~~ ~~unsigned int *prefix_code; /* This array holds the prefix codes */~~ ~~unsigned char *append_character; /* This array holds the appended chars */~~ ~~unsigned char decode_stack[4000]; /* This array holds the decoded string */~~ ~~unsigned char CUR_BITS; /* ~nab: added for variable bit size */~~ ~~/* we are processing bits but in the end of the day we do I/O in bytes */~~ ~~int input_bit_count, output_bit_count;~~ ~~unsigned long input_bit_buffer, output_bit_buffer;~~ ~~public:~~ ~~CLZWImpl() {~~ ~~code_value = 0;~~ ~~prefix_code = 0;~~ ~~append_character = 0~~namespace std;

/**
 * Returns every prime p with 2 <= p <= max, in ascending order, using a
 * bit-packed sieve of Eratosthenes (one bit per candidate, max/8 + 1 bytes).
 *
 * @param max  inclusive upper bound; values below 2 yield an empty vector.
 * @return     the primes found, smallest first.
 */
std::vector<unsigned long> get_primes(unsigned long max){
    std::vector<unsigned long> primes;

    // Bit (x % 8) of byte (x / 8) stays 1 while x is still a prime candidate.
    // The vector owns the storage, so it is released on every exit path,
    // including an exception thrown by push_back.
    std::vector<unsigned char> sieve(max/8 + 1, 0xFF);

    for(unsigned long x = 2; x <= max; x++){
        if(sieve[x/8] & (0x01 << (x % 8))){
            primes.push_back(x);

            // Is prime. Mark its multiples, starting at x*x: every smaller
            // multiple has a smaller prime factor and is already cleared.
            // The x <= max / x test keeps x*x from overflowing.
            if(x <= max / x){
                unsigned long j = x * x;
                for(;;){
                    sieve[j/8] &= static_cast<unsigned char>(~(0x01u << (j % 8)));
                    if(max - j < x)   // next multiple would exceed max (or wrap)
                        break;
                    j += x;
                }
            }
        }
        if(x == max)   // avoid wrapping x++ when max is the largest value
            break;
    }

    return primes;
}

~~~CLZWImpl~~int main(void) { vector<unsigned long> primes; if primes = get_primes(~~code_value~~1000000000); // return 0; ~~free(code_value)~~ // Print out result. vector<unsigned long>::iterator it; if for(it=primes.begin(~~prefix_code~~) ~~free~~; it < primes.end(~~prefix_code~~);it++) cout << *it << " "<<endl; ~~if(append_character)~~ ~~free(append_character)~~ cout << endl; return 0;

}

/** end of http://code.activestate.com/recipes/576559/ }}} */

</pre>

==== Nitin Prakash Panicker: LZW File Compression ====

~~int get_bits() { return CUR_BITS; }~~<pre>

Flat profile:

~~protected:~~

~~int Init() {~~Each sample counts as 0.01 seconds.

~~ATLASSERT(!code_value); /* call just once */~~ % cumulative self self total

time seconds seconds calls ns/call ns/call name

99.46 48.19 48.19 CLZWCompressFile::Compress(char*, char*)

~~code_value=~~ 0.33 48.35 0.16 17122488 9.34 9.34 CLZWCompressFile::getc_src(int*)~~malloc(TABLE_SIZE*sizeof(int));~~

~~prefix_code=~~ 0.21 48.45 0.10 7095561 14.09 14.09 CLZWCompressFile::putc_comp(~~unsigned~~ int*)~~malloc(TABLE_SIZE*sizeof(unsigned int));~~

~~append_character=(unsigned char*)malloc(TABLE_SIZE*sizeof(unsigned char));~~</pre>

==== Source Code for LZW File Compression ====

~~return code_value != 0 && prefix_code != 0 && append_character != 0;~~

}[[lzw.cpp]][[lzw.h]]

=== Assignment 2 ===

==== Source code for prime number generator we will be putting on the gpu ====

<pre>

# include <cmath>    // This library enables the use of sqrt.
# include <cstdio>   // getchar
# include <iostream>
using namespace std;

void primenum(long double); // Prototype...

int c = 0; // Number of primes printed by primenum.

/**
 * Asks for an upper bound, prints every prime up to it and then the count.
 */
int main(){
    long double x = 0;
    cout<<"\n This program will generate all prime numbers up to the"
        <<"\n number you have entered below...\n";
    cout<<"\n Please enter a number: ";
    cin>> x;
    cout<<"\n Here are all the prime numbers up to "<<x<<".\n";
    primenum(x); //function invocation...
    cout<<endl<<"\nThere are "<<c
        <<" prime numbers less than or equal to "<<x<<".\n\n";
    return 0;
}

/**
 * Prints every prime i with 2 <= i <= x by trial division and adds the
 * number of primes found to the global counter c.
 *
 * @param x  inclusive upper bound; nothing is printed when x < 2.
 */
void primenum(long double x){
    // Start at 2: 1 has no divisor in the tested range but is not a prime.
    for (int i = 2; i <= x; i++){
        bool prime = true;

        // A composite i has a divisor no larger than sqrt(i), so testing
        // beyond that bound cannot change the answer.
        int number2 = (int) floor (sqrt ((long double) i));
        for (int j = 2; j <= number2; j++){
            if (i % j == 0){
                prime = false;
                break;
            }
        }

        if (prime){
            cout <<" "<<i<<" ";
            c += 1;
        }
    }
    getchar();
}

~~virtual int getc_src()~~ </pre>= 0;=== Version of prime generator running on GPU ====<pre>

# include <cmath> /* read / This library enable the use of sqrt. # include <iostream> # include <cuda_runtime.h> using namespace std; void primenum(long double); // Prototype... int c = 0; int main(){ long double x = 0; cout<<"\n This program will generate all prime numbers up to the" <<"\n number you have entered below...\n"; cout<<"\n Please enter a ~~byte from compressed source~~ number: "; cin>> x; cout<<"\n Here are all the prime numbers up to "<<x<<".\n"; primenum(x); //function invocation... cout<<endl<<"\nThere are "<<c <<" prime numbers less than or equal to "<<x<<".\n\n"; return 0; } // This function will determine the primenumbers up to num. void primenum(~~during expansion~~long double x) { //Array to hold generated primes on host int *primes_h = new int[x]; //Device array to hold the primes on the device int *primes_d = new int[x]; //allocate device memory and ~~write~~ initialize device memory cudaMalloc((void**)&primes_d, x * sizeof(int)); cudaMemset(&primes_d,sizeof(int),x * sizeof(int); //Kernal goes here //error checking //copy the array holding primes from device to ~~compressed output~~ host cudaMemcpy(primes_h, primes_d, x *sizeof(int), cudaMemcpyDeviceToHost); //display the primes for(int i=0; i<x ; i++){ cout<<primes_h[i]<<endl; } //free allocated memory delete [] primes_h; cudaFree(primes_d); getchar(); }

~~virtual int getc_comp()~~ </pre>= 0;=== Almost Final version ====<pre># include <cmath> // This library enable the use of sqrt.

~~/* write a byte to compressed output */~~ # include <iostream>

~~virtual int putc_comp(int ch) = 0;~~ # include <ctime>

~~/* write a byte to expanded output */~~ #include<iomanip>

~~virtual int putc_out(int ch) = 0;~~ #include<cstdlib>

# include <cuda_runtime.h>

//#include <times.h>

/*

** This is the compression routine. The code should be a fairly close using namespace std;

** match to the algorithm accompanying the article.

** inline clock_t getMilliSecs() {

* return clock() /(CLOCKS_PER_SEC / 1000);

}

~~void compress()~~

{ __global__ void primegen(bool prime, int number2,int x,int *primes_d)

~~unsigned int next_code;~~ {

~~unsigned~~ int ~~character~~c = 0;

~~unsigned int string_code;~~

~~unsigned int index;~~

~~unsigned~~ for (int ~~bit_limit~~i = 1;i <= x; i++)

~~int i;~~ {

for ( int j = 2; j <= number2; j++)

{

~~ATLASSERT~~ if (~~code_value~~i!=j && i % j == 0 )~~; /* initialized? */~~

{

prime = false;

~~CUR_BITS = MIN_CODE_LEN~~ break;

~~bit_limit = CURRENT_MAX_CODES(CUR_BITS) - 1;~~ }

~~output_bit_count=0;~~ }

~~output_bit_buffer=0L;~~ ~~ATLASSERT(256==FIRST_CODE);~~ ~~next_code=FIRST_CODE; /* Next code is the next available string code*/~~ ~~for (i=0;i<TABLE_SIZE;i++) /* Clear out the string table before starting */~~ ~~code_value[i]=-1;~~ ~~string_code=getc_src(); /* Get the first code */~~ ~~if(-1 == string_code)~~ ~~return; /* empty file or error */~~ /* ** This is the main loop where it all happens. This loop runs util all of ** the input has been exhausted. Note that it stops adding codes to the ** table after all of the possible codes have been defined. */ ~~while ((character=getc_src()) != -1)~~ { ~~index=find_match(string_code,character);/* See if the string is in */~~ ~~if (code_value[index] != -1) /* the table. If it is, */~~ ~~string_code=code_value[index]; /* get the code value. If */~~ ~~else /* the string is not in the*/~~ ~~{ /* table, try to add it. */~~ ~~if (next_code <= MAX_CODE)~~ { ~~code_value[index]=next_code++;~~ ~~prefix_code[index]=string_code;~~ ~~append_character[index]=character;~~ } ~~/* are we using enough bits to write out this code word? */~~ if(~~string_code >= bit_limit && CUR_BITS < BITS~~prime)

{

~~/* mark need for bigger code word with all ones */~~ primes_d[c]=i;

~~output_code(bit_limit)~~ c += 1;

~~CUR_BITS++;~~ }

~~bit_limit~~ prime = ~~(CURRENT_MAX_CODES(CUR_BITS) - 1)~~true;

}

~~ATLASSERT(string_code < bit_limit);~~

void primenum(long double); // Prototype...

~~output_code(string_code);~~ ~~/* When a string is found */~~

~~string_code=character; /* that is not in the table*/~~

~~} /* I output the last string*/~~int main()

~~} /* after adding the new one*/~~ {

/* long double x = 0;

** End of cout<<"\n This program will generate all prime numbers up to the ~~main loop~~"<<"\n number you have entered below...\n";

*/ cout<<"\n Please enter a number: ";

cin>> x;

cout<<"\n Here are all the prime numbers up to "<<x<<".\n";

~~output_code~~ primenum(~~string_code~~x); /~~* Output the last code *~~/function invocation...

~~output_code(-1);~~ /~~* This code flushes the output buffer*~~/cout<<endl<<"\nThere are "<<c

} //<<" prime numbers less than or equal to "<<x<<".\n\n";

return 0;

}

/*

** // This is function will determine the ~~hashing routine~~primenumbers up to num. ~~It tries to find a match for the prefix+char~~

** string in the string table. If it finds it, the index is returned. If void primenum(long double x)

** the string is not found, the first available index in the string table is {

** returned instead. bool prime = true;

* //struct tms start_time, stop_time;

int ~~find_match(unsigned int hash_prefix,unsigned int hash_character)~~number2;

{ number2 =(int) floor (sqrt (x));

~~int index~~ clock_t start = getMilliSecs();

~~int offset;~~ //Array to hold generated primes on host

int *primes_h = new int[(int)x];

//Device array to hold the primes on the device

~~index~~ int *primes_d = new int[(~~hash_character << HASHING_SHIFT~~int) ~~^ hash_prefix~~x];

~~if (index == 0)~~ //allocate device memory and initialize device memory

~~offset = 1~~ cudaMalloc((void**)&primes_d, (int)x * sizeof(int));

~~else~~ // cudaMalloc((void**)&c_d, sizeof(int));

~~offset = TABLE_SIZE - index~~ cudaMemset(&primes_d,0,x * sizeof(int));

~~while (1)~~ //error checking

{ cudaError_t error ;

~~if (code_value[index] == -1)~~ //Kernal goes here

~~return~~ primegen<<<1,1>>>(~~index~~prime,number2,(int)x,primes_d);

~~if (prefix_code[index] == hash_prefix &&~~

~~append_character[index] == hash_character)~~ // extract error code from the kernel's execution

~~return~~ error = cudaGetLastError(~~index~~);

~~index -~~ if (error != ~~offset;~~cudaSuccess) {

if cout << cudaGetErrorString(~~index~~ error) < 0)< endl;

~~index += TABLE_SIZE;~~ }

}

} //copy the array holding primes from device to host

error =cudaMemcpy(primes_h, primes_d, ((int)x) * sizeof(int), cudaMemcpyDeviceToHost);

/* if (error != cudaSuccess) {

** This is the expansion routine. It takes an LZW format file, and expands cout << cudaGetErrorString(error) << endl;

** it to an output file. The code here should be a fairly close match to }

** the algorithm in the accompanying article. // cudaMemcpy(c_h, c_d, sizeof(int), cudaMemcpyDeviceToHost);

* //display the primes

for(int i=0; i<(int)x ; i++){

if(primes_h[i]>=2 && primes_h[i]<=(int)x){

~~void expand()~~ cout<<primes_h[i]<<endl;

{ ~~unsigned int next_code;~~ }

~~unsigned int new_code;~~ }

~~unsigned int old_code~~ cout << "Elapsed time: " << (getMilliSecs() - start) << "ms" << endl;

~~int character~~ // cout<< "time: "<< (stop_s-start_s)/double(CLOCKS_PER_SEC)<<endl;

~~unsigned char *string;~~ //free allocated memory

~~unsigned int bit_limit;~~

delete [] primes_h;

cudaFree(primes_d);

~~ATLASSERT(code_value); /* initialized? */~~

~~CUR_BITS = MIN_CODE_LEN;~~ ~~bit_limit = CURRENT_MAX_CODES~~ getchar(~~CUR_BITS) - 1;~~ ~~input_bit_count=0;~~ ~~input_bit_buffer=0L;~~ ~~// @@@ what if we pass uncompressed file to decode?~~ ~~next_code=FIRST_CODE; /* This is the next available code to define */~~ ~~old_code=input_code(); /* Read in the first code, initialize the */~~ ~~if(-1 == old_code)~~ ~~return; /* read error? */~~ ~~character=old_code; /* character variable, and send the first */~~ ~~if(putc_out(old_code)==-1) /* code to the output file */~~ ~~return; /* write error */~~ /* ** This is the main expansion loop. It reads in characters from the LZW file ** until it sees the special code used to inidicate the end of the data. */ ~~while ((new_code=input_code()) != (-1))~~ { ~~/* look for code length increase marker */~~ ~~if(bit_limit == new_code && CUR_BITS < BITS)~~ { ~~CUR_BITS++;~~ ~~bit_limit = CURRENT_MAX_CODES(CUR_BITS) - 1;~~ ~~new_code=input_code();~~ ~~ATLASSERT(new_code != -1); /* must be read error? */~~ ~~if(new_code == -1)~~ ~~break;~~ } ~~ATLASSERT(new_code < bit_limit);~~ /* ** This code checks for the special STRING+CHARACTER+STRING+CHARACTER+STRING ** case which generates an undefined code. It handles it by decoding ** the last code, and adding a single character to the end of the decode string. */ ~~if (new_code>=next_code)~~ { *decode_stack=character; ~~string=decode_string(decode_stack+1,old_code~~);

}

</pre>

/*=== Assignment 3 ======= Cuda Version:First Attempt ====

** Otherwise we do a straight decode of the new code. */ ~~else~~ ~~string=decode_string(decode_stack,new_code);~~ /* ** Now we output the decoded string in reverse order. */ ~~character=*string;~~ ~~while (string~~ <pre>~~= decode_stack)~~ ~~putc_out(*string--);~~ /* ** Finally, if possible, add a new code to the string table. */ ~~if (next_code <= MAX_CODE)~~ { ~~prefix_code[next_code]=old_code;~~ ~~append_character[next_code]=character;~~

~~next_code++;~~# include <cmath> // This library enable the use of sqrt.

}# include <iostream>

~~old_code=new_code;~~ # include <ctime>

} #include<iomanip>

} #include<cstdlib>

# include <cuda_runtime.h>

//#include <times.h>

/*

** This routine simply decodes a string from the string table, storing using namespace std;

** it in a buffer. The buffer can then be output in reverse order by

** the expansion program. inline clock_t getMilliSecs() {

*/ ~~/* ~nab: these char* aren't a risk for unicode; we are reading bytes */~~ ~~unsigned char *decode_string(unsigned char *buffer,unsigned int code)~~ { ~~int i;~~ ~~i=0;~~ ~~while (code >= FIRST_CODE)~~ { *buffer++ = append_character[code]; ~~code=prefix_code[code];~~ ~~i++;~~ ~~ATLASSERT(i < sizeof(decode_stack)); /* buffer overrun if it blows, increase stack size! */~~ } *buffer=code; returnclock(~~buffer~~); } /* ** The following two routines are used to output variable length ** codes. They are written strictly for clarity, and are not ** particularyl efficient. ~~~nab: there's room for improvement in these I/O functions eg work in DWORDS instead of bytes~~ */ ~~unsigned int input_code~~() { ~~int c;~~ ~~unsigned int return_value;~~ CLOCKS_PER_SEC /~~/static int input_bit_count=0;~~ ~~//static unsigned long input_bit_buffer=0L;~~ ~~while (input_bit_count <= 24)~~ { ~~if ((c = getc_comp()) == -1)~~ ~~break;~~ ~~input_bit_buffer |=~~ ~~(unsigned long) c << (24-input_bit_count);~~ ~~input_bit_count += 8;~~ } ~~if(input_bit_count < CUR_BITS) {~~ ~~ATLASSERT(!input_bit_buffer~~1000); ~~return -1; /* EOF */~~

}

~~return_value=input_bit_buffer >>~~ __global__ void primegen(~~32-CUR_BITS~~bool prime, int number2,int x,int *primes_d);

~~input_bit_buffer <<= CUR_BITS;~~ {

~~input_bit_count -~~ int c = ~~CUR_BITS~~0;

int idx = blockIdx.x * blockDim.x + threadIdx.x;

~~ATLASSERT~~ for (~~return_value~~ int i=1; i < ~~(1UL << CUR_BITS~~= x; i++));

~~return(return_value);~~ {

} if( i!= idx && i%idx == 0 )

{

prime = false;

~~/* bits are written outside normal byte boundaries, hence the need for keeping old values */~~ break;

~~void output_code(unsigned int code)~~ }

{

~~//static int output_bit_count=0;~~ if(prime)

~~//static unsigned long output_bit_buffer=0L;~~ {

primes_d[c]=i;

c += 1;

~~ATLASSERT(output_bit_count < 8); /* leftovers */~~ ~~ATLASSERT(CUR_BITS + output_bit_count <= 32);~~ ~~/*codes <256 are possible for single characters, zero bytes etc*/~~ ~~if(-1 == code) {~~ ~~/* pad remaining zeros and flush the last byte */~~ ~~if(output_bit_count) {~~ }

~~output_bit_buffer >>~~ prime = 24true; ~~ATLASSERT((output_bit_buffer & 0xFF) == output_bit_buffer);~~ ~~putc_comp(output_bit_buffer);~~ ~~output_bit_count = 0;~~ ~~output_bit_buffer = 0; /* in case some eejit calls us again */~~

}

~~return;~~ }

}

~~ATLASSERT~~/*for (~~code~~ int i = 1; i < ~~(1UL << CUR_BITS~~= x; i++));

{

for ( int j = 2; j <= number2; j++)

~~/* sends new bytes near the top (MSB) */~~ {

~~output_bit_buffer |~~ if ( i!=j && i % j == ~~(unsigned long) code << (32-CUR_BITS-output_bit_count~~0 );

~~output_bit_count += CUR_BITS;~~ {

~~while (output_bit_count >~~ prime = 8)false;

{ break;

~~/* no check for error but if there was a problem we'd know from the time we wrote the identifier */~~ }

~~putc_comp(output_bit_buffer >> 24);~~ }

~~output_bit_buffer <<= 8;~~ if (prime)

~~output_bit_count -= 8;~~ {

} primes_d[c]=i;

} c += 1;

}~~; /* CLZWImpl */~~

prime = true;

~~/* example derived class using C buffered I/O functions~~ } */

~~class CLZWCompressFile : public CLZWImpl {~~

~~public:~~

~~CLZWCompressFile() {~~

~~io_file = 0;~~

~~lzw_file = 0~~ void primenum(long double);// Prototype...

};

~~~CLZWCompressFile() {~~ ~~ATLASSERT(!io_file);~~ ~~ATLASSERT(!lzw_file);~~ }; ~~int AnyIOErrors() {return io_error; }~~ ~~// @@@ these char* should be changed for unicode builds~~ ~~unsigned~~ int ~~Compress~~main(~~char* input_file_name, char* to_name~~)

{

~~ATLASSERT(input_file_name && *input_file_name)~~ long double x = 0;

~~ATLASSERT(to_name && *to_name)~~ cout<<"\n This program will generate all prime numbers up to the"<<"\n number you have entered below...\n";

~~ATLASSERT(strcmp(to_name, input_file_name))~~ cout<<"\n Please enter a number: ";

cin>> x;

cout<<"\n Here are all the prime numbers up to "<<x<<".\n";

~~io_error = 1~~ primenum(x);//function invocation...

//cout<<endl<<"\nThere are "<<c

//<<" prime numbers less than or equal to "<<x<<".\n\n";

~~if(!code_value)~~ return 0;

~~if(!Init())~~ }

~~return 0; /* rare memory error */~~

// This function will determine the primenumbers up to num.

void primenum(long double x)

~~u_comp = 0;~~ ~~u_io = 0;~~ ~~io_file=fopen(input_file_name,"rb");~~ {

~~if(io_file) {~~ int n = x;

~~lzw_file=fopen(to_name,"wb")~~ int d;

~~if(lzw_file) {~~ bool prime = true;

/~~* write LZW identifier L+starting bytes *~~/struct tms start_time, stop_time;

~~putc('L', lzw_file)~~ int number2;

if number2 =(~~putc~~int) floor (~~MIN_CODE_LEN, lzw_file~~sqrt (x) ~~== MIN_CODE_LEN~~) {;

~~compress~~ clock_t start = getMilliSecs();

~~io_error = ferror(lzw_file) || ferror(io_file);~~

~~if(!io_error)~~ cudaDeviceProp prop;

~~ATLASSERT~~ cudaGetDevice(~~u_comp <= u_io~~&d); ~~/* this is bound to bomb every now and then, no compression! */~~

} cudaGetDeviceProperties(&prop, d);

~~fclose(lzw_file)~~ int nThreads = prop.maxThreadsDim[0];

~~lzw_file~~ int n_max = nThreads * prop.maxGridSize[0];

} if ( n> n_max) {

n = n_max;

cout << "n reduced to " << n << endl;

~~fclose(io_file);~~ }

~~io_file = 0;~~

} //Array to hold generated primes on host

int *primes_h = new int[(int)x];

~~return u_comp;~~ //Device array to hold the primes on the device

} int *primes_d = new int[(int)x];

//allocate device memory and initialize device memory

~~unsigned int Expand~~ cudaMalloc((~~char~~void* ~~lzw_name~~*)&primes_d, ~~char~~(int)x * ~~to_name~~sizeof(int));

{

~~ATLASSERT~~// cudaMalloc(~~lzw_name~~ (void**)&~~& *lzw_name~~c_d, sizeof(int));

~~ATLASSERT~~ cudaMemset(~~to_name &~~& primes_d,0,x *~~to_name~~sizeof(int));

~~ATLASSERT(strcmp(to_name, lzw_name));~~

//error checking

cudaError_t error ;

~~io_error = 1;~~

//Kernal goes here

primegen<<<(n + nThreads - 1) / nThreads, nThreads>>>(prime,number2,(int)x,primes_d);

~~if(!code_value)~~

~~if(!Init())~~ // extract error code from the kernel's execution

~~return 0; /* rare memory error */~~

error = cudaGetLastError();

if (error != cudaSuccess) {

~~u_comp = 0~~ cout << cudaGetErrorString(error) << endl;

~~u_io = 0;~~ }

~~lzw_file=fopen(lzw_name,"rb");~~

~~if(lzw_file) {~~ //copy the array holding primes from device to host

~~/* check LZW identifier L+starting bytes */~~

error =cudaMemcpy(primes_h, primes_d, ((int ~~ch1 = getc~~)x) * sizeof(~~lzw_file~~int), cudaMemcpyDeviceToHost);

~~int ch2 = getc(lzw_file);~~

if(~~'L'~~ error !=~~= ch1 && MIN_CODE_LEN==ch2~~cudaSuccess) {

~~io_file=fopen~~ cout << cudaGetErrorString(~~to_name,"wb"~~error)<< endl;

~~if(io_file) {~~ }

~~expand~~ // cudaMemcpy(c_h, c_d, sizeof(int), cudaMemcpyDeviceToHost);

~~io_error = ferror(lzw_file) || ferror(io_file);~~ //display the primes

for(int i=0; i<(int)x ; i++){

if(primes_h[i]>=2 && primes_h[i]<=(int)x){

~~fclose(io_file)~~ cout<<primes_h[i]<<endl; ~~io_file = 0;~~ }

}

cout << "Elapsed time: " << (getMilliSecs() - start) << "ms" << endl;

~~fclose~~ // cout<< "time: "<< (~~lzw_file~~stop_s-start_s)/double(CLOCKS_PER_SEC)<<endl;

~~lzw_file = 0;~~ //free allocated memory

}

delete [] primes_h;

cudaFree(primes_d);

~~return u_io;~~

} getchar();

}

</pre>

==== Conclusion: Logical Error ====

~~protected~~[[Image:gpuA3error.png|thumb|widthpx| ]]

~~/* -1 return indicates either EOF or~~ The prime number generator seems to have run into a logical error. It does not generate the prime numbers correctly; instead, it prints every number.

~~virtual int getc_src() {~~==== CUDA Version: Attempt Two ====
This attempt fails at run time with the CUDA error "invalid argument", and the logical error still persists.

~~ATLASSERT(io_file);~~==== Final Cuda version ====<pre>#include <cstdio>#include <cstdlib>#include <iostream>#include <ctime>#include <cuda_runtime.h>

~~int ch = getc(io_file)~~using namespace std;

/**
 * This macro checks the return value of a CUDA runtime call and exits
 * the application if the call failed, after printing the CUDA error string
 * together with the source line and file of the failing call.
 *
 * The do/while(0) wrapper makes the expansion a single statement, so the
 * macro can be followed by ';' and used safely inside if/else.
 */
#define CUDA_CHECK_RETURN(value) do {                                   \
        cudaError_t _m_cudaStat = (value);                              \
        if (_m_cudaStat != cudaSuccess) {                               \
            fprintf(stderr, "Error %s at line %d in file %s\n",         \
                    cudaGetErrorString(_m_cudaStat), __LINE__, __FILE__); \
            exit(1);                                                    \
        }                                                               \
    } while (0)

/**
 * Kernel code to generate and detect primes.
 *
 * Slot k of num stands for the odd number 2k+1, except slot 0, which always
 * holds 2. On return every slot in [0, size) contains its number if that
 * number is prime and 0 otherwise.
 *
 * Launch layout: 1-D grid of 1-D blocks of any size. A grid-stride loop lets
 * the launched threads cover all `size` slots, so the result does not depend
 * on how many threads were launched. No shared memory is used.
 *
 * Each slot is written by exactly one thread and no thread reads another
 * thread's slot, so no barrier is needed. Slots beyond the launched thread
 * count are always written, which addresses the stray values the page
 * reports for larger inputs.
 *
 * @param num        device array with room for `size` ints.
 * @param blockNum   kept for caller compatibility; the stride is taken from
 *                   the actual launch configuration instead.
 * @param threadNum  kept for caller compatibility, see blockNum.
 * @param size       number of slots; must not exceed 2^30 so that 2k+1 fits
 *                   in an int.
 */
__global__ void prime(int *num, int blockNum, int threadNum, int size) {
    (void) blockNum;
    (void) threadNum;

    // 64-bit index arithmetic so that adding the stride cannot overflow.
    const long long stride = (long long) gridDim.x * blockDim.x;
    long long k = (long long) blockIdx.x * blockDim.x + threadIdx.x;

    for (; k < size; k += stride) {
        if (k == 0) {
            // The first element is always 2 (it replaces the non-prime 1).
            num[0] = 2;
            continue;
        }

        const int value = (int) (2 * k + 1);
        bool isPrime = true;

        // value is odd, so only odd divisors up to sqrt(value) are tried.
        // d <= value / d is the overflow-free form of d * d <= value.
        for (int d = 3; d <= value / d; d += 2) {
            if (value % d == 0) {
                isPrime = false;
                break;
            }
        }

        // Check if an element is not prime; if it isn't, set it to 0.
        num[k] = isPrime ? value : 0;
    }
}

/**
 * Entry point. Reads the slot count n from argv[1], runs the prime kernel
 * over n slots on the current device, prints every non-zero slot and then
 * the time measured with clock().
 *
 * @return 0 on success, 1 when the argument is missing or not a positive
 *         integer. A failing CUDA call terminates the process through
 *         CUDA_CHECK_RETURN.
 */
int main(int argc, char* argv[]) {
    if (argc != 2) {
        cout << "Incorrect no of arguments" << endl;
        return 1;
    }

    // atoi yields 0 for non-numeric text; a non-positive count cannot size
    // the buffers below, so reject it before allocating anything.
    const int n = atoi(argv[1]);
    if (n <= 0) {
        cout << "Argument must be a positive integer" << endl;
        return 1;
    }

    /**
     * variable declarations
     */
    int *device = 0;
    // Heap allocation: a stack array sized by user input can overflow the
    // stack, and it must not be passed to delete[].
    int *host = new int[n];
    int d;
    cudaDeviceProp prop;

    /**
     * Get the properties of the device in use
     */
    CUDA_CHECK_RETURN(cudaGetDevice(&d));
    CUDA_CHECK_RETURN(cudaGetDeviceProperties(&prop, d));
    int numberOfBlocks = 8;
    int maxThreadsPerBlock = prop.maxThreadsPerBlock;
    int numberOfThreads = maxThreadsPerBlock / numberOfBlocks;

    /**
     * Start timer
     */
    clock_t cb, ce;
    cb = clock();

    /**
     * Allocate memory on the device and clear it, so that a slot the kernel
     * does not write reads back as 0 and is skipped by the output loop
     */
    CUDA_CHECK_RETURN(cudaMalloc((void**) &device, sizeof(int) * n));
    CUDA_CHECK_RETURN(cudaMemset(device, 0, sizeof(int) * n));

    /**
     * Call kernel with appropriate grid and thread size; a launch does not
     * return a status, so a bad configuration is picked up right after it
     */
    prime<<<numberOfBlocks, numberOfThreads>>>(device, numberOfBlocks, numberOfThreads, n);
    CUDA_CHECK_RETURN(cudaGetLastError());

    /**
     * Copy results back to host (the blocking copy also waits for the kernel)
     */
    CUDA_CHECK_RETURN(cudaMemcpy(host, device, sizeof(int) * n, cudaMemcpyDeviceToHost));

    /**
     * Free memory on device
     */
    CUDA_CHECK_RETURN(cudaFree(device));

    /**
     * Output values
     */
    for (int i = 0; i < n; i++)
        if (host[i] != 0)
            cout << host[i] << endl;

    /**
     * Stop timer
     */
    ce = clock();
    cout << "Prime generation - took " << double(ce - cb) / CLOCKS_PER_SEC << " seconds" << endl;

    delete [] host;
    return 0;
}

</pre>

~~----~~[[Image:manualDelete.png|thumb|200px|Manual Delete Warning]]
===== Final version's errors, warnings and observations =====
* If a number over 515 is entered as the launch argument, the program displays random values at the end of the list of prime numbers.
* When attempting to delete the host array manually in the program, a warning is displayed.
[[Image:ManualCrash.png|thumb|200px|Manual Delete Crash]]
* The program crashes at the end if the host array is manually deleted.

=== ~~Assignment 2~~ ==Successful run of Prime generation ==~~== Assignment 3~~ ===[[Image:PrimeSuccessfulRun.png]]

Mohamed Baig

1

edit

Changes

DPS915/M-N-M

Navigation menu

Personal tools

Namespaces

Variants

Views

More

Search

Navigation

get involved with CDOT

courses

course projects

links

Tools