Difference between revisions of "GPU610/DPS915 MCM Decrypt"

From CDOT Wiki
Jump to: navigation, search
(Kernel Attempts)
(Assignment 3)
 
(2 intermediate revisions by the same user not shown)
Line 250: Line 250:
  
 
-Currently I am trying to get the above kernels to work before handing in the assignment as I feel that having just the initialization kernel would not be nearly sufficient for the purpose of this assignment.
 
-Currently I am trying to get the above kernels to work before handing in the assignment as I feel that having just the initialization kernel would not be nearly sufficient for the purpose of this assignment.
 +
 +
==== Work With Prefix Scan ====
 +
 +
-At the moment I am working on various simplified versions of a prefix sum algorithm that I am hoping will lead me on the right path to completing my assignment. These algorithms have been gathered from various sources such as MIT, NVIDIA, as well as CUDA documentation.
 +
 +
-Below is a sequential prescan used to perform a prescan on an array.
 +
 +
<source lang="cpp">
 +
 +
void scan( float* arr1, float* input, int n) {
 +
  output[0] = 0; // since this is an exclusive scan, we do not include the first element
 +
  for(int i = 1; i < length; ++i) {
 +
      arr1[i] = input[i-1] + arr1[i-1];
 +
  }
 +
}
 +
 +
</source>
 +
 +
-Below is a parallel scan which does the same thing as the above function
 +
 +
<source lang="cpp">
 +
 +
global__ void scan(float *g_odata, float *g_idata, int n) {
 +
  extern __shared__ float temp[]; // allocated on invocation
 +
 +
  int thid = threadIdx.x;
 +
  int pout = 0, pin = 1;
 +
 +
  // load input into shared memory.
 +
  // This is exclusive scan, so shift right by one and set first elt to 0
 +
  temp[pout*n + thid] = (thid > 0) ? g_idata[thid-1] : 0;
 +
  __syncthreads();
 +
 +
  for (int offset = 1; offset < n; offset *= 2) {
 +
      pout = 1 - pout; // swap double buffer indices
 +
      pin = 1 - pout;
 +
 +
      if (thid >= offset)
 +
        temp[pout*n+thid] += temp[pin*n+thid - offset];
 +
      else
 +
      temp[pout*n+thid] = temp[pin*n+thid];
 +
 +
      __syncthreads();
 +
  }
 +
 +
  g_odata[thid] = temp[pout*n+thid1]; // write output
 +
}
 +
 +
</source>
  
 
=== Assignment 3 ===
 
=== Assignment 3 ===
 +
-In the end I found a fairly simplified encryption program and decided to work with that so I could at least get something to hand in in the end. Encryption works with every letter of the string or file you are working with, and therefore is a perfect candidate to be parallelized.
 +
-Here is the source code snippit of the CPU code:
 +
 +
<source lang=cpp>
 +
void encrypt(char *inp,char *out,int key) 
 +
 +
    std::ifstream input; 
 +
    std::ofstream output; 
 +
    char buf; 
 +
    input.open(inp); 
 +
    output.open(out); 
 +
    buf=input.get(); 
 +
   
 +
    while(!input.eof()) 
 +
    { 
 +
        if(buf>='a'&&buf<='z') { 
 +
            buf-='a'; 
 +
            buf+=key; 
 +
            buf%=26; 
 +
            buf+='A'; 
 +
        } 
 +
        else if(buf>='A'&&buf<='Z') {
 +
            buf-='A'; 
 +
            buf+=26-key; 
 +
            buf%=26; 
 +
            buf+='a';
 +
        }
 +
        output.put(buf); 
 +
        buf=input.get(); 
 +
    } 
 +
    input.close(); 
 +
    output.close(); 
 +
    //readText(inp); 
 +
    //readText(out); 
 +
}
 +
 +
</source>
 +
 +
-I then created a kernel which was very simple
 +
 +
<source lang=cpp>
 +
 +
__global__ void encrypt2(char *inp, int key) {
 +
    int i = threadIdx.x;
 +
    if(inp[i]>='a'&&inp[i]<='z') { 
 +
        inp[i]-='a'; 
 +
        inp[i]+=key; 
 +
        inp[i]%=26; 
 +
        inp[i]+='A'; 
 +
    } 
 +
    else if(inp[i]>='A'&&inp[i]<='Z') {
 +
        inp[i]-='A'; 
 +
        inp[i]+=26-key; 
 +
        inp[i]%=26; 
 +
        inp[i]+='a';
 +
    }
 +
}
 +
 +
</source>
 +
 +
-and optimized it to allow for larger sized strings (so all of the threads would not be operating on just one block and leaving the other streaming multiprocessors idle.
 +
 +
<source lang=cpp>
 +
 +
__global__ void encrypt2(char *inp, int key) {
 +
    int i = blockIdx.x * blockDim.x + threadIdx.x;
 +
    if(inp[i]>='a'&&inp[i]<='z') { 
 +
        inp[i]-='a'; 
 +
        inp[i]+=key; 
 +
        inp[i]%=26; 
 +
        inp[i]+='A'; 
 +
    } 
 +
    else if(inp[i]>='A'&&inp[i]<='Z') {
 +
        inp[i]-='A'; 
 +
        inp[i]+=26-key; 
 +
        inp[i]%=26; 
 +
        inp[i]+='a';
 +
    }
 +
}
 +
</source>

Latest revision as of 17:58, 6 December 2013

MySQL Decrypt

Team Members

  1. Matt MacEachern

eMail All

Progress

Assignment 1

Topic

I have always been interested in computer security, and especially interested in the algorithms used encrypt the passwords that are stored in various databases. After browsing around multiple open source code repositories I found a relatively simplistic C program (which I am planning on converting to C++ at a later date) that is used to decrypt MYSQL passwords given the MYSQL hash of the password.

Required Files

https://drive.google.com/#folders/0B63ZcmSNsUsqX3MyTGNSeFBuSlU

Compilation And Running

Below you will see a compilation and example run of the application with the hash 29bad1457ee5e49e (which equals the password pass (a small password so I could test the application in a short period of time). Mysql compilation.png

Output

[ 500000 ] 0x4e 0x42 0x5c (NB\)
[ 1000000 ] 0x21 0x65 0x3d 0x21 (!e=!)
[ 1500000 ] 0x4f 0x2c 0x7a 0x21 (O,z!)
[ 2000000 ] 0x22 0x4f 0x5b 0x22 ("O[")
[ 2500000 ] 0x50 0x71 0x3c 0x23 (Pq<#)
[ 3000000 ] 0x23 0x39 0x79 0x23 (#9y#)
[ 3500000 ] 0x51 0x5b 0x5a 0x24 (Q[Z$)
[ 4000000 ] 0x24 0x23 0x3c 0x25 ($#<%)
[ 4500000 ] 0x52 0x45 0x78 0x25 (REx%)
[ 5000000 ] 0x25 0x68 0x59 0x26 (%hY&)
[ 5500000 ] 0x53 0x2f 0x3b 0x27 (S/;')
[ 6000000 ] 0x26 0x52 0x77 0x27 (&Rw')
[ 6500000 ] 0x54 0x74 0x58 0x28 (TtX()
[ 7000000 ] 0x27 0x3c 0x3a 0x29 ('<:))
[ 7500000 ] 0x55 0x5e 0x76 0x29 (U^v))
[ 8000000 ] 0x28 0x26 0x58 0x2a ((&X*)
[ 8500000 ] 0x56 0x48 0x39 0x2b (VH9+)
[ 9000000 ] 0x29 0x6b 0x75 0x2b ()ku+)
[ 9500000 ] 0x57 0x32 0x57 0x2c (W2W,)
[ 10000000 ] 0x2a 0x55 0x38 0x2d (*U8-)
[ 10500000 ] 0x58 0x77 0x74 0x2d (Xwt-)
[ 11000000 ] 0x2b 0x3f 0x56 0x2e (+?V.)
[ 11500000 ] 0x59 0x61 0x37 0x2f (Ya7/)
[ 12000000 ] 0x2c 0x29 0x74 0x2f (,)t/)
[ 12500000 ] 0x5a 0x4b 0x55 0x30 (ZKU0)
[ 13000000 ] 0x2d 0x6e 0x36 0x31 (-n61)
[ 13500000 ] 0x5b 0x35 0x73 0x31 ([5s1)
[ 14000000 ] 0x2e 0x58 0x54 0x32 (.XT2)
[ 14500000 ] 0x5c 0x7a 0x35 0x33 (\z53)
[ 15000000 ] 0x2f 0x42 0x72 0x33 (/Br3)
[ 15500000 ] 0x5d 0x64 0x53 0x34 (]dS4)
[ 16000000 ] 0x30 0x2c 0x35 0x35 (0,55)
[ 16500000 ] 0x5e 0x4e 0x71 0x35 (^Nq5)
[ 17000000 ] 0x31 0x71 0x52 0x36 (1qR6)
[ 17500000 ] 0x5f 0x38 0x34 0x37 (_847)
[ 18000000 ] 0x32 0x5b 0x70 0x37 (2[p7)
[ 18500000 ] 0x60 0x22 0x52 0x38 (`"R8)
[ 19000000 ] 0x33 0x45 0x33 0x39 (3E39)
[ 19500000 ] 0x61 0x67 0x6f 0x39 (ago9)
[ 20000000 ] 0x34 0x2f 0x51 0x3a (4/Q:)
[ 20500000 ] 0x62 0x51 0x32 0x3b (bQ2;)
[ 21000000 ] 0x35 0x74 0x6e 0x3b (5tn;)
[ 21500000 ] 0x63 0x3b 0x50 0x3c (c;P<)
[ 22000000 ] 0x36 0x5e 0x31 0x3d (6^1=)
[ 22500000 ] 0x64 0x25 0x6e 0x3d (d%n=)
[ 23000000 ] 0x37 0x48 0x4f 0x3e (7HO>)
[ 23500000 ] 0x65 0x6a 0x30 0x3f (ej0?)
[ 24000000 ] 0x38 0x32 0x6d 0x3f (82m?)
[ 24500000 ] 0x66 0x54 0x4e 0x40 (fTN@)
[ 25000000 ] 0x39 0x77 0x2f 0x41 (9w/A)
[ 25500000 ] 0x67 0x3e 0x6c 0x41 (g>lA)
[ 26000000 ] 0x3a 0x61 0x4d 0x42 (:aMB)
[ 26500000 ] 0x68 0x28 0x2f 0x43 (h(/C)
[ 27000000 ] 0x3b 0x4b 0x6b 0x43 (;KkC)
[ 27500000 ] 0x69 0x6d 0x4c 0x44 (imLD)
[ 28000000 ] 0x3c 0x35 0x2e 0x45 (<5.E)
[ 28500000 ] 0x6a 0x57 0x6a 0x45 (jWjE)
[ 29000000 ] 0x3d 0x7a 0x4b 0x46 (=zKF)
[ 29500000 ] 0x6b 0x41 0x2d 0x47 (kA-G)
[ 30000000 ] 0x3e 0x64 0x69 0x47 (>diG)
[ 30500000 ] 0x6c 0x2b 0x4b 0x48 (l+KH)
[ 31000000 ] 0x3f 0x4e 0x2c 0x49 (?N,I)
[ 31500000 ] 0x6d 0x70 0x68 0x49 (mphI)
[ 32000000 ] 0x40 0x38 0x4a 0x4a (@8JJ)
[ 32500000 ] 0x6e 0x5a 0x2b 0x4b (nZ+K)
[ 33000000 ] 0x41 0x22 0x68 0x4b (A"hK)
[ 33500000 ] 0x6f 0x44 0x49 0x4c (oDIL)
[ 34000000 ] 0x42 0x67 0x2a 0x4d (Bg*M)
[ 34500000 ] 0x70 0x2e 0x67 0x4d (p.gM)
[ 35000000 ] 0x43 0x51 0x48 0x4e (CQHN)
[ 35500000 ] 0x71 0x73 0x29 0x4f (qs)O)
[ 36000000 ] 0x44 0x3b 0x66 0x4f (D;fO)
[ 36500000 ] 0x72 0x5d 0x47 0x50 (r]GP)
[ 37000000 ] 0x45 0x25 0x29 0x51 (E%)Q)
[ 37500000 ] 0x73 0x47 0x65 0x51 (sGeQ)
[ 38000000 ] 0x46 0x6a 0x46 0x52 (FjFR)
[ 38500000 ] 0x74 0x31 0x28 0x53 (t1(S)
[ 39000000 ] 0x47 0x54 0x64 0x53 (GTdS)
[ 39500000 ] 0x75 0x76 0x45 0x54 (uvET)
[ 40000000 ] 0x48 0x3e 0x27 0x55 (H>'U)
[ 40500000 ] 0x76 0x60 0x63 0x55 (v`cU)
[ 41000000 ] 0x49 0x28 0x45 0x56 (I(EV)
[ 41500000 ] 0x77 0x4a 0x26 0x57 (wJ&W)
[ 42000000 ] 0x4a 0x6d 0x62 0x57 (JmbW)
[ 42500000 ] 0x78 0x34 0x44 0x58 (x4DX)
[ 43000000 ] 0x4b 0x57 0x25 0x59 (KW%Y)
[ 43500000 ] 0x79 0x79 0x61 0x59 (yyaY)
[ 44000000 ] 0x4c 0x41 0x43 0x5a (LACZ)
[ 44500000 ] 0x7a 0x63 0x24 0x5b (zc$[)
[ 45000000 ] 0x4d 0x2b 0x61 0x5b (M+a[)
[ 45500000 ] 0x20 0x4e 0x42 0x5c ( NB\)
[ 46000000 ] 0x4e 0x70 0x23 0x5d (Np#])
[ 46500000 ] 0x21 0x38 0x60 0x5d (!8`])
[ 47000000 ] 0x4f 0x5a 0x41 0x5e (OZA^)
[ 47500000 ] 0x22 0x22 0x23 0x5f (""#_)
[ 48000000 ] 0x50 0x44 0x5f 0x5f (PD__)
[ 48500000 ] 0x23 0x67 0x40 0x60 (#g@`)
[ 49000000 ] 0x51 0x2e 0x22 0x61 (Q."a)
[ 49500000 ] 0x24 0x51 0x5e 0x61 ($Q^a)
[ 50000000 ] 0x52 0x73 0x3f 0x62 (Rs?b)
[ 50500000 ] 0x25 0x3b 0x21 0x63 (%;!c)
[ 51000000 ] 0x53 0x5d 0x5d 0x63 (S]]c)
[ 51500000 ] 0x26 0x25 0x3f 0x64 (&%?d)
[ 52000000 ] 0x54 0x47 0x20 0x65 (TG e)
[ 52500000 ] 0x27 0x6a 0x5c 0x65 ('j\e)
[ 53000000 ] 0x55 0x31 0x3e 0x66 (U1>f)
[ 53500000 ] 0x28 0x54 0x7a 0x66 ((Tzf)
[ 54000000 ] 0x56 0x76 0x5b 0x67 (Vv[g)
[ 54500000 ] 0x29 0x3e 0x3d 0x68 ()>=h)
[ 55000000 ] 0x57 0x60 0x79 0x68 (W`yh)
[ 55500000 ] 0x2a 0x28 0x5b 0x69 (*([i)
[ 56000000 ] 0x58 0x4a 0x3c 0x6a (XJ<j)
[ 56500000 ] 0x2b 0x6d 0x78 0x6a (+mxj)
[ 57000000 ] 0x59 0x34 0x5a 0x6b (Y4Zk)
[ 57500000 ] 0x2c 0x57 0x3b 0x6c (,W;l)
[ 58000000 ] 0x5a 0x79 0x77 0x6c (Zywl)
[ 58500000 ] 0x2d 0x41 0x59 0x6d (-AYm)
[ 59000000 ] 0x5b 0x63 0x3a 0x6e ([c:n)
[ 59500000 ] 0x2e 0x2b 0x77 0x6e (.+wn)
[ 60000000 ] 0x5c 0x4d 0x58 0x6f (\MXo)
[ 60500000 ] 0x2f 0x70 0x39 0x70 (/p9p)
[ 61000000 ] 0x5d 0x37 0x76 0x70 (]7vp)
[ 61500000 ] 0x30 0x5a 0x57 0x71 (0ZWq)
[ 62000000 ] 0x5e 0x21 0x39 0x72 (^!9r)
[ 62500000 ] 0x31 0x44 0x75 0x72 (1Dur)
[ 63000000 ] 0x5f 0x66 0x56 0x73 (_fVs)
[ 63239711 ] 0x70 0x61 0x73 0x73 (pass)
MATCH [pass] [29bad1457ee5e49e]==[29bad1457ee5e49e]

Profile

After looking at the code and profiling it, I believe that there are multiple areas which could be parallelized and optimised to reduce the execution time significantly. Most of the execution time takes place in the brute(char const*) and hash_password(unsigned long*, char const*) functions, so I will focus most of my effort and time into those areas when the time comes to optimise them.

Mysql profile.png
Mysql graph.png

Assignment 2

Kernel Attempts

So far I have made attempts to convert different pieces of compute heavy code into kernels in order to obtain the same results as the original code but none have been successful.

Original Code 1:

unsigned long nr=1345345333L;
unsigned long add=7;
unsigned long nr2=0x12345671L;
unsigned long tmp;
int passLen = strlen(password);

for (int i = 0; i < passLen; i++) {
   if (*password == ' ' || *password == '\t')
      continue;
   tmp= (unsigned long) (unsigned char) password[i];
   nr^= (((nr & 63)+add)*tmp)+ (nr << 8);
   nr2+=(nr2 << 8) ^ nr;
   add+=tmp;
}

Kernel Attempt 1(Not working):

__global__ void hash_password_kernel(unsigned long* nr, unsigned long* nr2, unsigned long* add, const char *password) {
   int idx = blockIdx.x * blockDim.x + threadIdx.x;

   unsigned long tmp;

   tmp= (unsigned long) (unsigned char) password[idx];
   *nr^= (((*nr & 63)+*add)*tmp)+ (*nr << 8);
   *nr2+=(*nr2 << 8) ^ *nr;
   *add+=tmp;

}

Original Code 2:

for(int i=pos; i<max; i++) {
   if(data[i] != max) {
   data[i]++;
   pos=i;
   break;
   }
}

Kernel Attempt 2(Not working):

__global__ void testKernel(unsigned char *data, unsigned int max, unsigned int* pos) {
   if(data[threadIdx.x] != max) {
      data[threadIdx.x]++;
      *pos=threadIdx.x;
   }
}

Original Code 3:

memset(encrypted_password, 0, 255);
memset((char*)&data, min, 255);

Kernal Attempt 3(Working):

__global__ void initialise(unsigned char* data, char* encrypted_password) {
   data[threadIdx.x] = 32;
   encrypted_password[threadIdx.x] = 0;
}

-Currently I am trying to get the above kernels to work before handing in the assignment as I feel that having just the initialization kernel would not be nearly sufficient for the purpose of this assignment.

Work With Prefix Scan

-At the moment I am working on various simplified versions of a prefix sum algorithm that I am hoping will lead me on the right path to completing my assignment. These algorithms have been gathered from various sources such as MIT, NVIDIA, as well as CUDA documentation.

-Below is a sequential prescan used to perform a prescan on an array.

void scan( float* arr1, float* input, int n) {
   output[0] = 0; // since this is an exclusive scan, we do not include the first element
   for(int i = 1; i < length; ++i) {
      arr1[i] = input[i-1] + arr1[i-1];
   }
}

-Below is a parallel scan which does the same thing as the above function

global__ void scan(float *g_odata, float *g_idata, int n) {
   extern __shared__ float temp[]; // allocated on invocation

   int thid = threadIdx.x;
   int pout = 0, pin = 1;

   // load input into shared memory.
   // This is exclusive scan, so shift right by one and set first elt to 0
   temp[pout*n + thid] = (thid > 0) ? g_idata[thid-1] : 0;
   __syncthreads();

   for (int offset = 1; offset < n; offset *= 2) {
      pout = 1 - pout; // swap double buffer indices
      pin = 1 - pout;

      if (thid >= offset)
         temp[pout*n+thid] += temp[pin*n+thid - offset];
      else
      temp[pout*n+thid] = temp[pin*n+thid];

      __syncthreads();
   }

   g_odata[thid] = temp[pout*n+thid1]; // write output
}

Assignment 3

-In the end I found a fairly simplified encryption program and decided to work with that so I could at least get something to hand in in the end. Encryption works with every letter of the string or file you are working with, and therefore is a perfect candidate to be parallelized. -Here is the source code snippit of the CPU code:

void encrypt(char *inp,char *out,int key)  
 {  
     std::ifstream input;  
     std::ofstream output;  
     char buf;  
     input.open(inp);  
     output.open(out);  
     buf=input.get();  
     
     while(!input.eof())  
     {  
         if(buf>='a'&&buf<='z') {  
             buf-='a';  
             buf+=key;  
             buf%=26;  
             buf+='A';  
         }  
         else if(buf>='A'&&buf<='Z') {
             buf-='A';  
             buf+=26-key;  
             buf%=26;  
             buf+='a';
         }
         output.put(buf);  
         buf=input.get();  
     }  
     input.close();  
     output.close();  
     //readText(inp);  
     //readText(out);  
 }

-I then created a kernel which was very simple

__global__ void encrypt2(char *inp, int key) {
    int i = threadIdx.x;
    if(inp[i]>='a'&&inp[i]<='z') {  
        inp[i]-='a';  
        inp[i]+=key;  
        inp[i]%=26;  
        inp[i]+='A';  
    }  
    else if(inp[i]>='A'&&inp[i]<='Z') {
        inp[i]-='A';  
        inp[i]+=26-key;  
        inp[i]%=26;  
        inp[i]+='a';
    }
}

-and optimized it to allow for larger sized strings (so all of the threads would not be operating on just one block and leaving the other streaming multiprocessors idle.

__global__ void encrypt2(char *inp, int key) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if(inp[i]>='a'&&inp[i]<='z') {  
        inp[i]-='a';  
        inp[i]+=key;  
        inp[i]%=26;  
        inp[i]+='A';  
    }  
    else if(inp[i]>='A'&&inp[i]<='Z') {
        inp[i]-='A';  
        inp[i]+=26-key;  
        inp[i]%=26;  
        inp[i]+='a';
    }
}