
Jump to: navigation, search


2,110 bytes removed, 16:55, 12 April 2013
Final version's errors, warnings and observations
=== Assignment 1 ===
==== Muhammad Ahsan: Prime Number Generator (primes up to 1,000,000,000) ====
=== Assignment 1 ===
==== Nitin Prakash Panicker: LZW File Compression ====
Flat profile:
Each sample counts as 0.01 seconds.
% cumulative self self total
time seconds seconds calls ns/call ns/call name
99.46 48.19 48.19 CLZWCompressFile::Compress(char*, char*)
0.33 48.35 0.16 17122488 9.34 9.34 CLZWCompressFile::getc_src()
0.21 48.45 0.10 7095561 14.09 14.09 CLZWCompressFile::putc_comp(int)
=== lzw.cpp ===
 #include <time.h> #include "lzw.h"   /******************************************************************** ** ** This program gets a file name from the command line{{{ http://code. It compresses the ** file, placing its output in a file named testactivestate.lzw. It then expands ** test.lzw into test.out. Test.out should then be an exact duplicate of ** the input file. ** *************************************************************************com/recipes/576559/   main(int argc, char r2) *argv[]) { clock_t timer; CLZWCompressFile lzw;  /
Copyright (c) 2008 Florian Mayer
** Get the file name Permission is hereby granted, open it upfree of charge, to any person obtaining a copy of this software and open up associated documentation files (the lzw output file."Software"), to deal in the Software without restriction, including without limitation the rights* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is if (argc==1) furnished to do so, subject to the following conditions:
{ The above copyright notice and this permission notice shall be included in  printf("Input file name to compress?\n");  return 0;  }    printf("testing %s...\n", argv[1]); /* ** Compress all copies or substantial portions of the fileSoftware.
timer = clock();  int crunch = lzw.Compress(argv[1], "test.lzw");  timer = clock() - timer; //CLOCKS_PER_SEC  printf("compress time=%d ms, encoding=%d, size=%u", timer, lzw.get_bits(), crunch);  int filesize = lzw.u_io;  printf(" (ratio=%d%%)\n", filesize ? (filesize-crunch)*100/filesize : 0);  if(lzw.AnyIOErrors())  printf("***I/O ERROR***\n");   /* ** Expand the file. */  timer = clock();  int orig = lzw.Expand("test.lzw", "test.out");  timer = clock() - timer; //CLOCKS_PER_SEC  printf("expand time=%d ms, encoding=%d\n", timer, lzw.get_bits());  if(lzw.AnyIOErrors())  printf("***I/O ERROR***\n");    ATLASSERT(filesize == orig); // did we mangle the file?  return 0; }</pre> === lzw.h === <pre> #ifndef UPRIGHT_LZW_H #define UPRIGHT_LZW_H     /* LZW.h by N.A.Bozinis @ 19/01/2010 08:55:52  * ----------------------------------------------------------------------------------  *  * Plain C++ port of LZW compression algorithm and code originally (c) Mark R. Nelson  *  * Variable bit length encoding idea and code originally by Michael Dipperstein  *  *  * There are a lot of compression classes floating around but most are based on the  * zlib (zip/unzip) library, which is good but a bit of overkill for simple and small  * code. LZW combines decent compression ratios with very small code footprint. If  * you need something more powerful here are a few resources:  *  *  *  *  *  * Microsoft types can check the CAB protocol that is available in all windows:  *  *  *  */   #include <stdio.h> #include <stdlib.hiostream> #include <limits.hvector
#include <string.h>
#include <assert.h> #define ATLASSERT assert   /* NOTE: function and variable names left as much as possible matching the original  LZW.c by Mark, naturally bundled in classes to get rid of static/globals etc */   #define MIN_CODE_LEN 9 /* min # bits in a code word */ #define MAX_CODE_LEN 20 /* max # bits in a code word */ #define CURRENT_MAX_CODES(x) (1UL << (x))   #define FIRST_CODE (1 << CHAR_BIT) /* value of 1st string code */   #if (MIN_CODE_LEN <= CHAR_BIT) #error Code words must be larger than 1 character #endif   #if (MAX_CODE_LEN >= 25) #error Code words must fit in an integer #endif     /* VARIABLE BIT LENGTH ENCODING  * Instead of using a fixed number of bits for code words, we start at 9 (=MIN_CODE_LEN)  * and go up to BITS (<=MAX_CODE_LEN) so that small files are tightly packed and larger  * files are fine too. The BITS constant determines the maximum hash table size. For 18  * this means 250KB runtime table size which is enough for files ~4MB.  * There is no problem for files larger than that; if we run out of table space for new  * codes then the same codes are emitted (uncompressed obviously)  */   #define BITS 17 /* Setting the number of bits to 12, 13*/ #define HASHING_SHIFT (BITS-8) /* or 14 affects several constants. */ #define MAX_VALUE (1 << BITS) - 1 /* Note that MS-DOS machines need to */ #define MAX_CODE MAX_VALUE - 1 /* compile their code in large model if*/  /* 14 bits are selected. */   #if BITS == 20  #define TABLE_SIZE 1048583 #elif BITS == 19  #define TABLE_SIZE 524309 #elif BITS == 18  #define TABLE_SIZE 262147 #elif BITS == 17  #define TABLE_SIZE 131101 #elif BITS == 16  #define TABLE_SIZE 65543 #elif BITS == 15  #define TABLE_SIZE 32797 #elif BITS == 14  #define TABLE_SIZE 18041 /* The string table size needs to be a */  /* prime number that is somewhat larger*/ #elif BITS == 13 /* than 2**BITS. 
*/  #define TABLE_SIZE 9029 #elif BITS == 12  #define TABLE_SIZE 5021 #else #error define smaller or bigger table sizes #endif   #if (TABLE_SIZE <= MAX_VALUE) #error your prime numbers need attention #endif   #if (BITS > MAX_CODE_LEN) #error BITS can only go up to a maximum #endif     /*  This class does most of the job, except reading source and writing the compressed data  A derived class does that so that there's flexibility to read either from files or memory  */   class CLZWImpl { protected: int *code_value; /* This is the code value array */ unsigned int *prefix_code; /* This array holds the prefix codes */ unsigned char *append_character; /* This array holds the appended chars */ unsigned char decode_stack[4000]; /* This array holds the decoded string */ unsigned char CUR_BITS; /* ~nab: added for variable bit size */ /* we are processing bits but in the end of the day we do I/O in bytes */ int input_bit_count, output_bit_count; unsigned long input_bit_buffer, output_bit_buffer;   public: CLZWImpl() {  code_value = 0;  prefix_code = 0;  append_character = 0namespace std;
/**
 * Sieve of Eratosthenes: collects every prime p with 2 <= p <= max.
 *
 * One bit per candidate is kept in a byte buffer (bit x % 8 of byte x / 8),
 * so the working set is max / 8 + 1 bytes.
 *
 * @param max  Inclusive upper bound of the range to search.
 * @return     The primes found, in ascending order; empty when max < 2.
 */
std::vector<unsigned long> get_primes(unsigned long max){
    std::vector<unsigned long> primes;

    // Every bit starts set: each number is assumed prime until crossed out.
    // A vector owns the buffer, so it is released even if push_back throws.
    std::vector<unsigned char> sieve(max / 8 + 1, 0xFF);

    for (unsigned long x = 2; x <= max; x++) {
        if (sieve[x / 8] & (1u << (x % 8))) {
            // Bit still set, so x is prime: record it ...
            primes.push_back(x);
            // ... and clear the bit of every multiple of x.
            for (unsigned long j = 2 * x; j <= max; j += x) {
                sieve[j / 8] &= static_cast<unsigned char>(~(1u << (j % 8)));
            }
        }
    }
    return primes;
}
  ~CLZWImplint main(void) { vector<unsigned long> primes; if primes = get_primes(code_value1000000000); // return 0; free(code_value) // Print out result. vector<unsigned long>::iterator it if for(it=primes.begin(prefix_code free; it < primes.end(prefix_code);it++) cout << *it << " "<<endl; if(append_character) free(append_character) cout << endl; return 0;
/** end of }}} */
==== Nitin Prakash Panicker: LZW File Compression ====
int get_bits() { return CUR_BITS; }<pre>
Flat profile:
int Init() {Each sample counts as 0.01 seconds.
ATLASSERT(!code_value); /* call just once */ % cumulative self self total
time seconds seconds calls ns/call ns/call name
99.46 48.19 48.19 CLZWCompressFile::Compress(char*, char*)
code_value= 0.33 48.35 0.16 17122488 9.34 9.34 CLZWCompressFile::getc_src(int*)malloc(TABLE_SIZE*sizeof(int));
prefix_code= 0.21 48.45 0.10 7095561 14.09 14.09 CLZWCompressFile::putc_comp(unsigned int*)malloc(TABLE_SIZE*sizeof(unsigned int));
append_character=(unsigned char*)malloc(TABLE_SIZE*sizeof(unsigned char));</pre>
==== Source Code for LZW File Compression ====
return code_value != 0 && prefix_code != 0 && append_character != 0;
=== Assignment 2 ===
==== Source code for prime number generator we will be putting on the gpu ====
# include <cmath> /* override these 4/ This library enable the use of sqrt. # include <iostream> using namespace std; void primenum(long double); // Prototype... int c = 0; int main(){ long double x = 0; cout<<"\n This program will generate all prime numbers up to the" <<"\n number you have entered below...\n"; cout<<"\n Please enter a number: read a byte from source *"; cin>> x; cout<<"\n Here are all the prime numbers up to "<<x<<".\n"; primenum(x); //function invocation... cout<<endl<<"\nThere are "<<c <<" prime numbers less than or equal to "<<x<<".\n\n"; return 0; } // This function will determine the primenumbers up to num. void primenum(long double x){ bool prime = true; //Calculates the square-root of 'x' int number2; number2 =(int) floor (sqrt (x)); for (int i = 1; i <= x; i++){ for ( int j = 2; j <= number2; j++){ if ( i!=j && i % j == 0 ){ prime = false; break; } } if (prime){ cout <<" "<<i<<" "; c += 1; } prime = true; } getchar(); }
virtual int getc_src() </pre>= 0;=== Version of prime generator running on GPU ====<pre>
# include <cmath> /* read / This library enable the use of sqrt. # include <iostream> # include <cuda_runtime.h> using namespace std; void primenum(long double); // Prototype... int c = 0; int main(){ long double x = 0; cout<<"\n This program will generate all prime numbers up to the" <<"\n number you have entered below...\n"; cout<<"\n Please enter a byte from compressed source number: "; cin>> x; cout<<"\n Here are all the prime numbers up to "<<x<<".\n"; primenum(x); //function invocation... cout<<endl<<"\nThere are "<<c <<" prime numbers less than or equal to "<<x<<".\n\n"; return 0; } // This function will determine the primenumbers up to num. void primenum(during expansionlong double x) { //Array to hold generated primes on host int *primes_h = new int[x]; //Device array to hold the primes on the device int *primes_d = new int[x]; //allocate device memory and write initialize device memory cudaMalloc((void**)&primes_d, x * sizeof(int)); cudaMemset(&primes_d,sizeof(int),x * sizeof(int); //Kernal goes here //error checking //copy the array holding primes from device to compressed output host cudaMemcpy(primes_h, primes_d, x *sizeof(int), cudaMemcpyDeviceToHost); //display the primes for(int i=0; i<x ; i++){ cout<<primes_h[i]<<endl; } //free allocated memory delete [] primes_h; cudaFree(primes_d); getchar(); }
virtual int getc_comp() </pre>= 0;=== Almost Final version ====<pre># include <cmath> // This library enable the use of sqrt.
/* write a byte to compressed output */ # include <iostream>
virtual int putc_comp(int ch) = 0; # include <ctime>
/* write a byte to expanded output */ #include<iomanip>
virtual int putc_out(int ch) = 0; #include<cstdlib>
# include <cuda_runtime.h>
//#include <times.h>
** This is the compression routine. The code should be a fairly close using namespace std;
** match to the algorithm accompanying the article.
** inline clock_t getMilliSecs() {
* return clock() /(CLOCKS_PER_SEC / 1000);
void compress()
{ __global__ void primegen(bool prime, int number2,int x,int *primes_d)
unsigned int next_code; {
unsigned int characterc = 0;
unsigned int string_code;
unsigned int index;
unsigned for (int bit_limiti = 1;i <= x; i++)
int i; {
for ( int j = 2; j <= number2; j++)
ATLASSERT(code_value); /* initialized? */    CUR_BITS = MIN_CODE_LEN;  bit_limit = CURRENT_MAX_CODES(CUR_BITS) - 1;  output_bit_count=0;  output_bit_buffer=0L;    ATLASSERT(256==FIRST_CODE);  next_code=FIRST_CODE; /* Next code is the next available string code*/  for if (i!=0;i<TABLE_SIZE;i++) /* Clear out the string table before starting */  code_value[j && i]% j =-1;    string_code=getc_src(); /* Get the first code */  if(-1 == string_code)  return; /* empty file or error */   /* ** This is the main loop where it all happens. This loop runs util all of ** the input has been exhausted. Note that it stops adding codes to the ** table after all of the possible codes have been defined. */  while ((character=getc_src()) != -1)  {  index=find_match(string_code,character);/* See if the string is in */  if (code_value[index] != -1) /* the table. If it is, */  string_code=code_value[index]; /* get the code value. If */  else /* the string is not in the*/  { /* table, try to add it. */  if (next_code <= MAX_CODE0 {
code_value[index]=next_code++; {
prefix_code[index] prime =string_codefalse;
append_character[index]=character break;
  /* are we using enough bits to write out this code word? */  if(string_code >= bit_limit && CUR_BITS < BITSprime)
/* mark need for bigger code word with all ones */ primes_d[c]=i;
output_code(bit_limit) c += 1;
bit_limit prime = (CURRENT_MAX_CODES(CUR_BITS) - 1)true;
ATLASSERT(string_code < bit_limit);
void primenum(long double); // Prototype...
output_code(string_code); /* When a string is found */
string_code=character; /* that is not in the table*/
} /* I output the last string*/int main()
} /* after adding the new one*/ /* {
** End of the main loop. long double x = 0;
*/ cout<<"\n This program will generate all prime numbers up to the"<<"\n number you have entered below...\n";
cout<<"\n Please enter a number: ";
cin>> x;
output_code(string_code) cout<<"\n Here are all the prime numbers up to "<<x<<".\n"; /* Output the last code */
output_code primenum(-1x); /* This code flushes the output buffer*/function invocation...
} //cout<<endl<<"\nThere are "<<c
//<<" prime numbers less than or equal to "<<x<<".\n\n";
return 0;
/* }
** This is the hashing routine. It tries to find a match for the prefix+char
** string in // This function will determine the string tableprimenumbers up to num. If it finds it, the index is returned. If
** the string is not found, the first available index in the string table is void primenum(long double x)
** returned instead. {
*/ bool prime = true;
int find_match(unsigned int hash_prefix //struct tms start_time,unsigned int hash_character)stop_time;
{ int number2;
number2 =(int index) floor (sqrt (x));
int offset clock_t start = getMilliSecs();
//Array to hold generated primes on host
int *primes_h = new int[(int)x];
index = (hash_character << HASHING_SHIFT) ^ hash_prefix; //Device array to hold the primes on the device
if int *primes_d = new int[(index == 0int)x];
offset = 1; //allocate device memory and initialize device memory
else cudaMalloc((void**)&primes_d, (int)x * sizeof(int));
offset = TABLE_SIZE - index // cudaMalloc((void**)&c_d, sizeof(int));
while cudaMemset(1&primes_d,0,x * sizeof(int));
{ //error checking
if (code_value[index] == -1) cudaError_t error ;
return(index); //Kernal goes here
if primegen<<<1,1>>>(prefix_code[index] == hash_prefix &&prime,number2,(int)x,primes_d);
append_character[index] == hash_character)
return(index); // extract error code from the kernel's execution
index - error = offsetcudaGetLastError();
if (index < 0error != cudaSuccess){
index += TABLE_SIZE cout << cudaGetErrorString(error) << endl;
//copy the array holding primes from device to host
error =cudaMemcpy(primes_h, primes_d, ((int)x) * sizeof(int), cudaMemcpyDeviceToHost);
** This is the expansion routine. It takes an LZW format file, and expands if (error != cudaSuccess) {
** it to an output file. The code here should be a fairly close match to cout << cudaGetErrorString(error) << endl;
** the algorithm in the accompanying article. }
* // cudaMemcpy(c_h, c_d, sizeof(int), cudaMemcpyDeviceToHost);
//display the primes
for(int i=0; i<(int)x ; i++){
void expand if(primes_h[i]>=2 && primes_h[i]<=(int)x){
{ cout<<primes_h[i]<<endl;
unsigned int next_code; }
unsigned int new_code; }
unsigned int old_code cout << "Elapsed time: " << (getMilliSecs() - start) << "ms" << endl;
int character // cout<< "time: "<< (stop_s-start_s)/double(CLOCKS_PER_SEC)<<endl;
unsigned char *string; //free allocated memory
unsigned int bit_limit;
delete [] primes_h;
ATLASSERT(code_value); /* initialized? */
   CUR_BITS = MIN_CODE_LEN;  bit_limit = CURRENT_MAX_CODES getchar(CUR_BITS) - 1;  input_bit_count=0;  input_bit_buffer=0L;   // @@@ what if we pass uncompressed file to decode?    next_code=FIRST_CODE; /* This is the next available code to define */    old_code=input_code(); /* Read in the first code, initialize the */  if(-1 == old_code)  return; /* read error? */  character=old_code; /* character variable, and send the first */  if(putc_out(old_code)==-1) /* code to the output file */  return; /* write error */ /* ** This is the main expansion loop. It reads in characters from the LZW file ** until it sees the special code used to inidicate the end of the data. */  while ((new_code=input_code()) != (-1))  {  /* look for code length increase marker */  if(bit_limit == new_code && CUR_BITS < BITS)  {  CUR_BITS++;  bit_limit = CURRENT_MAX_CODES(CUR_BITS) - 1;    new_code=input_code();  ATLASSERT(new_code != -1); /* must be read error? */  if(new_code == -1)  break;  }    ATLASSERT(new_code < bit_limit);   /* ** This code checks for the special STRING+CHARACTER+STRING+CHARACTER+STRING ** case which generates an undefined code. It handles it by decoding ** the last code, and adding a single character to the end of the decode string. */  if (new_code>=next_code)  {  *decode_stack=character;  string=decode_string(decode_stack+1,old_code);
/*
=== Assignment 3 ===
==== Cuda Version: First Attempt ====
** Otherwise we do a straight decode of the new code. */  else  string=decode_string(decode_stack,new_code); /* ** Now we output the decoded string in reverse order. */  character=*string;  while (string >= decode_stack)  putc_out(*string--); /* ** Finally, if possible, add a new code to the string table. */  if (next_code <= MAX_CODE)  {  prefix_code[next_code]=old_code;  append_character[next_code]=character;  next_code++;  }  old_code=new_code;  } }   /* ** This routine simply decodes a string from the string table, storing ** it in a buffer. The buffer can then be output in reverse order by ** the expansion program. */ /* ~nab: these char* aren't a risk for unicode; we are reading bytes */ unsigned char *decode_string(unsigned char *buffer,unsigned int code) { int i;    i=0;  while (code pre>= FIRST_CODE)  {  *buffer++ = append_character[code];  code=prefix_code[code];  i++;  ATLASSERT(i < sizeof(decode_stack)); /* buffer overrun if it blows, increase stack size! */  }  *buffer=code;  return(buffer); }   /* ** The following two routines are used to output variable length ** codes. They are written strictly for clarity, and are not ** particularyl efficient.    ~nab: there's room for improvement in these I/O functions eg work in DWORDS instead of bytes */   unsigned int input_code() { int c; unsigned int return_value; //static int input_bit_count=0;
# include <cmath> //static unsigned long input_bit_buffer=0L;This library enable the use of sqrt.
# include <iostream>
# include <ctime>
while (input_bit_count #include<= 24)iomanip>
{ #include<cstdlib>
if ((c = getc_comp()) == -1) # include <cuda_runtime.h>
break; //#include <times.h>
using namespace std;
input_bit_buffer |=
inline clock_t getMilliSecs(unsigned long) c << (24-input_bit_count);{
input_bit_count += 8;  }    if return clock(input_bit_count < CUR_BITS) {  ATLASSERT/ (!input_bit_bufferCLOCKS_PER_SEC / 1000);  return -1; /* EOF */
return_value=input_bit_buffer >> __global__ void primegen(32-CUR_BITSbool prime, int number2,int x,int *primes_d);
input_bit_buffer <<= CUR_BITS; {
input_bit_count - int c = CUR_BITS0;
int idx = blockIdx.x * blockDim.x + threadIdx.x;
ATLASSERT for (return_value int i=1; i < (1UL << CUR_BITS= x; i++));
return(return_value); {
} if( i!= idx && i%idx == 0 )
prime = false;
/* bits are written outside normal byte boundaries, hence the need for keeping old values */ break;
void output_code(unsigned int code) }
//static int output_bit_count=0; if(prime)
//static unsigned long output_bit_buffer=0L; {
c += 1;
ATLASSERT(output_bit_count < 8); /* leftovers */  ATLASSERT(CUR_BITS + output_bit_count <= 32);  /*codes <256 are possible for single characters, zero bytes etc*/    if(-1 == code) {  /* pad remaining zeros and flush the last byte */  if(output_bit_count) { }
output_bit_buffer >> prime = 24true ATLASSERT((output_bit_buffer & 0xFF) == output_bit_buffer);  putc_comp(output_bit_buffer);    output_bit_count = 0;  output_bit_buffer = 0; /* in case some eejit calls us again */
return; }
ATLASSERT/*for (code int i = 1; i < (1UL << CUR_BITS= x; i++));
for ( int j = 2; j <= number2; j++)
/* sends new bytes near the top (MSB) */ {
output_bit_buffer | if ( i!=j && i % j == (unsigned long) code << (32-CUR_BITS-output_bit_count0 );
output_bit_count += CUR_BITS; {
while (output_bit_count > prime = 8)false;
{ break;
/* no check for error but if there was a problem we'd know from the time we wrote the identifier */ }
putc_comp(output_bit_buffer >> 24); }
output_bit_buffer <<= 8; if (prime)
output_bit_count -= 8; {
} primes_d[c]=i;
} c += 1;
}; /* CLZWImpl */
prime = true;
/* example derived class using C buffered I/O functions } */
class CLZWCompressFile : public CLZWImpl {
CLZWCompressFile() {
io_file = 0;
lzw_file = 0 void primenum(long double);// Prototype...
~CLZWCompressFile() {  ATLASSERT(!io_file);  ATLASSERT(!lzw_file);  };    int AnyIOErrors() {return io_error; }    // @@@ these char* should be changed for unicode builds  unsigned int Compressmain(char* input_file_name, char* to_name)
ATLASSERT(input_file_name && *input_file_name) long double x = 0;
ATLASSERT(to_name && *to_name) cout<<"\n This program will generate all prime numbers up to the"<<"\n number you have entered below...\n";
ATLASSERT(strcmp(to_name, input_file_name)) cout<<"\n Please enter a number: ";
cin>> x;
cout<<"\n Here are all the prime numbers up to "<<x<<".\n";
io_error = 1 primenum(x);//function invocation...
//cout<<endl<<"\nThere are "<<c
//<<" prime numbers less than or equal to "<<x<<".\n\n";
if(!code_value) return 0;
if(!Init()) }
return 0; /* rare memory error */
// This function will determine the primenumbers up to num.
void primenum(long double x)
u_comp = 0; {
u_io int n = 0x;
io_file=fopen(input_file_name,"rb") int d;
if(io_file) {bool prime = true;
lzw_file=fopen(to_name //struct tms start_time,"wb")stop_time;
if(lzw_file) { int number2;
/* write LZW identifier L+starting bytes */ number2 =(int) floor (sqrt (x));
putc clock_t start = getMilliSecs('L', lzw_file);
if(putc(MIN_CODE_LEN, lzw_file) == MIN_CODE_LEN) {
compress() cudaDeviceProp prop;
io_error = ferror cudaGetDevice(lzw_file) || ferror(io_file&d);
if cudaGetDeviceProperties(!io_error&prop, d);
ATLASSERT(u_comp < int nThreads = u_io)prop.maxThreadsDim[0]; /* this is bound to bomb every now and then, no compression! */
} int n_max = nThreads * prop.maxGridSize[0];
fclose if (lzw_filen> n_max);{
lzw_file n = 0n_max;
} cout << "n reduced to " << n << endl;
fclose(io_file); //Array to hold generated primes on host
io_file int *primes_h = 0new int[(int)x];
//Device array to hold the primes on the device
int *primes_d = new int[(int)x];
return u_comp;
} //allocate device memory and initialize device memory
cudaMalloc((void**)&primes_d, (int)x * sizeof(int));
unsigned int Expand // cudaMalloc((charvoid** lzw_name)&c_d, char* to_namesizeof(int));
{ cudaMemset(&primes_d,0,x * sizeof(int));
ATLASSERT(lzw_name && *lzw_name);
ATLASSERT(to_name && *to_name); //error checking
ATLASSERT(strcmp(to_name, lzw_name)) cudaError_t error ;
//Kernal goes here
io_error = primegen<<<(n + nThreads - 1) / nThreads, nThreads>>>(prime,number2,(int)x,primes_d);
// extract error code from the kernel's execution
if error = cudaGetLastError(!Init());
return 0; /* rare memory if (error */!= cudaSuccess) {
cout << cudaGetErrorString(error) << endl;
u_comp = 0;
u_io = 0; //copy the array holding primes from device to host
if error =cudaMemcpy(lzw_fileprimes_h, primes_d, ((int)x) * sizeof(int), cudaMemcpyDeviceToHost) {;
/* check LZW identifier L+starting bytes */
int ch1 if (error != getc(lzw_filecudaSuccess);{
int ch2 = getc cout << cudaGetErrorString(lzw_fileerror)<< endl;
if('L' == ch1 && MIN_CODE_LEN==ch2) { }
io_file=fopen // cudaMemcpy(to_namec_h,"wb"c_d, sizeof(int), cudaMemcpyDeviceToHost);
if(io_file) { //display the primes
expand for(int i=0; i<(int)x ;i++){
io_error if(primes_h[i]>=2 && primes_h[i]<= ferror(lzw_fileint) || ferror(io_filex);{
   fclose(io_file) cout<<primes_h[i]<<endl io_file = 0;  }
cout << "Elapsed time: " << (getMilliSecs() - start) << "ms" << endl;
fclose // cout<< "time: "<< (lzw_filestop_s-start_s)/double(CLOCKS_PER_SEC)<<endl;
lzw_file = 0; //free allocated memory
delete [] primes_h;
return u_io;
} getchar();
==== Conclusion: Logical Error ====
protected[[Image:gpuA3error.png|thumb|widthpx| ]]
/* -1 return indicates either EOF or some IO error */
The prime number generator seems to have run into a logical error: it does not generate the prime numbers correctly and instead prints every number.
virtual int getc_src() {
==== Cuda Version: Attempt Two ====
This version gives the run-time error "invalid argument", and the logical error still persists.
ATLASSERT(io_file);==== Final Cuda version ====<pre>#include <cstdio>#include <cstdlib>#include <iostream>#include <ctime>#include <cuda_runtime.h>
int ch = getc(io_file)using namespace std;
/** * This macro checks return value of the CUDA runtime call and exits * the application ifthe call failed. */#define CUDA_CHECK_RETURN(EOF value) { \ cudaError_t _m_cudaStat =value; \ if (_m_cudaStat != chcudaSuccess) { \ fprintf(stderr, "Error %s at line %d in file %s\n", \ cudaGetErrorString(_m_cudaStat), __LINE__, __FILE__); \ exit(1); \ } }
return -1/** * Kernel code to generate and detect primes */__global__ void prime(int *num, int blockNum, int threadNum, int size) { const int tid = blockIdx.x * blockDim.x + threadIdx.x; const int bid = blockIdx.y * blockDim.y + threadIdx.y; __syncthreads();
* Generate prime numbers and store them in the array.
* The first element is always 2
if(tid == 0) {
num[tid] = 2;
} else {
num[tid] = 2 * tid + 1;
int tmp = bid * threadNum + tid;
u_io int step1 = 2 * tmp +3; int step2 = tmp +1;
return ch while(tmp < size) { int i = 1; /** * Check if an element is not prime, if it isn't set it to 0. */ while((step1 * i + step2) < size) { num[step1 * i + step2] = 0; i++; } tmp += blockNum * threadNum; __syncthreads(); }}
int main(int argc, char* argv[]) { if(argc != 2) { cout << "Incorrect no of arguments" << endl; return 1; } int n = atoi(argv[1]);
virtual /** * variable declarations */ int getc_comp() {*device; int host[n]; int d; cudaDeviceProp prop;
ATLASSERT /** * Get the properties of the device in use */ cudaGetDevice(lzw_file&d); cudaGetDeviceProperties(&prop, d); int numberOfBlocks = 8; int maxThreadsPerBlock = prop.maxThreadsPerBlock; int numberOfThreads = maxThreadsPerBlock/numberOfBlocks;
int ch /** * Start timer */ clock_t cb, ce; cb = getcclock(lzw_file);
if /** * Allocate memory on the device */ CUDA_CHECK_RETURN(EOF == chcudaMalloc((void**) &device, sizeof(int) * n));
return -1 /** * Call kernel with appropriate grid and thread size */ prime<<<numberOfBlocks, numberOfThreads>>>(device, numberOfBlocks, numberOfThreads, n);
* Copy results back to host
CUDA_CHECK_RETURN(cudaMemcpy(&host, device, sizeof(int) * n, cudaMemcpyDeviceToHost));
* Free memory on device
u_comp /** * Output values */ for (int i = 0; i < n; i++) if (host[i] != 0) cout << host[i] << endl;
return ch; /** * Stop timer } */  virtual int putc_comp ce = clock(int ch) {  ATLASSERT(lzw_file);  ATLASSERT(ch >= 0 && ch cout << "Prime generation - took " << 256);  int ret = putcdouble(ch, lzw_file);    if(ret != EOF) {  ATLASSERT(ret == ch);  u_comp++;  }  else  ret = ce -1;    return ret;  }  virtual int putc_out(int chcb) {  ATLASSERT(io_file);  ATLASSERT(ch >= 0 && ch /CLOCKS_PER_SEC << " seconds" << 256);  int ret = putc(ch, io_file);    if(ret != EOF)  u_io++;  else  ret = -1;    return ret;  }    FILE* io_file;  FILE *lzw_fileendl int io_error; public:  unsigned long u_io, u_comp; /* bytes read and written */ };   // @@@ could have a generic one on IStream, CreateStreamOnHGlobal/SHCreateStreamOnFile   #endif /* UPRIGHT_LZW_H */
----
[[Image:manualDelete.png|thumb|200px|Manual Delete Warning]]
===== Final version's errors, warnings and observations =====
* If a number over 515 is entered as the launch argument, the program displays random values at the end of the list of prime numbers.
* When attempting to delete the host array manually in the program, a warning is displayed.
[[Image:ManualCrash.png|thumb|200px|Manual Delete Crash]]
* The program crashes at the end if the host array is manually deleted.
=== Assignment 2 ==Successful run of Prime generation ==== Assignment 3 ===[[Image:PrimeSuccessfulRun.png]]

Navigation menu