Difference between revisions of "GPU610/TeamKappa"
(→Team Members) |
(→Assignment 1) |
||
Line 8: | Line 8: | ||
== Progress == | == Progress == | ||
=== Assignment 1 === | === Assignment 1 === | ||
+ | |||
+ | == Program 1: RSA Encryption == | ||
+ | |||
+ | '''Profiling''' | ||
+ | |||
+ | * gprof: Encryption 10MB | ||
+ | |||
+ | % cumulative self self total | ||
+ | time seconds seconds calls s/call s/call name | ||
+ | 60.48 1.50 1.50 3495254 0.00 0.00 fastModularExp(unsigned long long, unsigned long long, unsigned long long) | ||
+ | 32.66 2.31 0.81 3495254 0.00 0.00 findHighestSetBit(unsigned long long) | ||
+ | 4.44 2.42 0.11 1 0.11 2.48 encryption(std::string, std::string, std::string) | ||
+ | 2.42 2.48 0.06 3495254 0.00 0.00 intToByteArray(int, char*) | ||
+ | 0.00 2.48 0.00 1 0.00 0.00 _GLOBAL__sub_I__Z10encryptionSsSsSs | ||
+ | 0.00 2.48 0.00 1 0.00 0.00 _GLOBAL__sub_I_main | ||
+ | 0.00 2.48 0.00 1 0.00 0.00 __static_initialization_and_destruction_0(int, int) | ||
+ | 0.00 2.48 0.00 1 0.00 0.00 __static_initialization_and_destruction_0(int, int) | ||
+ | |||
+ | * gprof: Encryption 20MB | ||
+ | |||
+ | % cumulative self self total | ||
+ | time seconds seconds calls s/call s/call name | ||
+ | 63.65 3.27 3.27 6990507 0.00 0.00 fastModularExp(unsigned long long, unsigned long long, unsigned long long) | ||
+ | 29.04 4.75 1.49 6990507 0.00 0.00 findHighestSetBit(unsigned long long) | ||
+ | 3.90 4.96 0.20 1 0.20 5.13 encryption(std::string, std::string, std::string) | ||
+ | 3.41 5.13 0.17 6990507 0.00 0.00 intToByteArray(int, char*) | ||
+ | 0.00 5.13 0.00 1 0.00 0.00 _GLOBAL__sub_I__Z10encryptionSsSsSs | ||
+ | 0.00 5.13 0.00 1 0.00 0.00 _GLOBAL__sub_I_main | ||
+ | 0.00 5.13 0.00 1 0.00 0.00 __static_initialization_and_destruction_0(int, int) | ||
+ | 0.00 5.13 0.00 1 0.00 0.00 __static_initialization_and_destruction_0(int, int) | ||
+ | |||
+ | * gprof: Encryption 30MB | ||
+ | |||
+ | % cumulative self self total | ||
+ | time seconds seconds calls s/call s/call name | ||
+ | 68.04 5.11 5.11 10485760 0.00 0.00 fastModularExp(unsigned long long, unsigned long long, unsigned long long) | ||
+ | 25.23 7.00 1.90 10485760 0.00 0.00 findHighestSetBit(unsigned long long) | ||
+ | 3.99 7.30 0.30 1 0.30 7.51 encryption(std::string, std::string, std::string) | ||
+ | 2.73 7.51 0.20 10485760 0.00 0.00 intToByteArray(int, char*) | ||
+ | 0.00 7.51 0.00 1 0.00 0.00 _GLOBAL__sub_I__Z10encryptionSsSsSs | ||
+ | 0.00 7.51 0.00 1 0.00 0.00 _GLOBAL__sub_I_main | ||
+ | 0.00 7.51 0.00 1 0.00 0.00 __static_initialization_and_destruction_0(int, int) | ||
+ | 0.00 7.51 0.00 1 0.00 0.00 __static_initialization_and_destruction_0(int, int) | ||
+ | |||
+ | * gprof: Decryption 10MB | ||
+ | |||
+ | % cumulative self self total | ||
+ | time seconds seconds calls s/call s/call name | ||
+ | 69.80 1.71 1.71 3495254 0.00 0.00 fastModularExp(unsigned long long, unsigned long long, unsigned long long) | ||
+ | 21.63 2.24 0.53 3495254 0.00 0.00 findHighestSetBit(unsigned long long) | ||
+ | 6.12 2.39 0.15 1 0.15 2.45 decryption(std::string, std::string, std::string) | ||
+ | 2.45 2.45 0.06 3495254 0.00 0.00 intToByteArray(int, char*) | ||
+ | 0.00 2.45 0.00 1 0.00 0.00 _GLOBAL__sub_I__Z10encryptionSsSsSs | ||
+ | 0.00 2.45 0.00 1 0.00 0.00 _GLOBAL__sub_I_main | ||
+ | 0.00 2.45 0.00 1 0.00 0.00 __static_initialization_and_destruction_0(int, int) | ||
+ | 0.00 2.45 0.00 1 0.00 0.00 __static_initialization_and_destruction_0(int, int) | ||
+ | |||
+ | * gprof: Decryption 20MB | ||
+ | |||
+ | % cumulative self self total | ||
+ | time seconds seconds calls s/call s/call name | ||
+ | 64.11 3.38 3.38 6990507 0.00 0.00 fastModularExp(unsigned long long, unsigned long long, unsigned long long) | ||
+ | 27.18 4.82 1.44 6990507 0.00 0.00 findHighestSetBit(unsigned long long) | ||
+ | 4.73 5.07 0.25 1 0.25 5.28 decryption(std::string, std::string, std::string) | ||
+ | 3.98 5.28 0.21 6990507 0.00 0.00 intToByteArray(int, char*) | ||
+ | 0.00 5.28 0.00 1 0.00 0.00 _GLOBAL__sub_I__Z10encryptionSsSsSs | ||
+ | 0.00 5.28 0.00 1 0.00 0.00 _GLOBAL__sub_I_main | ||
+ | 0.00 5.28 0.00 1 0.00 0.00 __static_initialization_and_destruction_0(int, int) | ||
+ | 0.00 5.28 0.00 1 0.00 0.00 __static_initialization_and_destruction_0(int, int) | ||
+ | |||
+ | * gprof: Decryption 30MB | ||
+ | |||
+ | % cumulative self self total | ||
+ | time seconds seconds calls s/call s/call name | ||
+ | 66.88 5.25 5.25 10485760 0.00 0.00 fastModularExp(unsigned long long, unsigned long long, unsigned long long) | ||
+ | 24.59 7.18 1.93 10485760 0.00 0.00 findHighestSetBit(unsigned long long) | ||
+ | 5.73 7.63 0.45 1 0.45 7.85 decryption(std::string, std::string, std::string) | ||
+ | 2.80 7.85 0.22 10485760 0.00 0.00 intToByteArray(int, char*) | ||
+ | 0.00 7.85 0.00 1 0.00 0.00 _GLOBAL__sub_I__Z10encryptionSsSsSs | ||
+ | 0.00 7.85 0.00 1 0.00 0.00 _GLOBAL__sub_I_main | ||
+ | 0.00 7.85 0.00 1 0.00 0.00 __static_initialization_and_destruction_0(int, int) | ||
+ | 0.00 7.85 0.00 1 0.00 0.00 __static_initialization_and_destruction_0(int, int) | ||
+ | |||
+ | |||
+ | '''Bottleneck Code''' | ||
+ | |||
+ | The two slowest parts of this encryption method are fastModularExp and findHighestSetBit. A third function, intToByteArray, takes up a relatively small amount of time but may still be able to be optimized. | ||
+ | |||
+ | unsigned int fastModularExp(ULong a, ULong b, ULong c) { | ||
+ | ULong result = 1; | ||
+ | ULong leadingbit = findHighestSetBit(b); // Heighest set bit | ||
+ | while(leadingbit > 0){ //while there are bits left | ||
+ | result = ((result*result) % c); //case 1: bit is a 0 | ||
+ | if((b & leadingbit) > 0){ | ||
+ | result = ((result * a) % c); //case 2: if bit is a 1 | ||
+ | } | ||
+ | leadingbit = leadingbit >> 1; | ||
+ | } | ||
+ | return (unsigned int)result; | ||
+ | } | ||
+ | |||
+ | ULong findHighestSetBit(ULong num){ | ||
+ | ULong result = 0; | ||
+ | for(int i = 63; i >= 0; i--){ | ||
+ | if(num & (1ULL << i)){ | ||
+ | result = 1ULL << i; | ||
+ | return result; | ||
+ | } | ||
+ | } | ||
+ | return result; | ||
+ | } | ||
+ | |||
+ | byte* intToByteArray(int num, byte *result){ | ||
+ | for(int i = 0; i < 4; i++){ | ||
+ | result[i] = (num & (0xFF << (8 *(3-i)))) >> (8 *(3-i)); | ||
+ | } | ||
+ | return result; | ||
+ | } | ||
+ | |||
+ | At first glance, the fastest function is also the function that appears like it would be the easiest to run on a GPU. The other code does not look like it can be optimized with a GPU easily as it does not use large arrays of N size. However, after some short internet research, several documents turned up on the topic of using a GPU to make RSA faster (example: [http://research.ijcaonline.org/volume87/number6/pxc3893704.pdf here] and [http://ijssst.info/Vol-15/No-3/data/3857a529.pdf here]). This leaves me hopeful that in the further stages of this assignment that this could be an interesting program to work with. | ||
+ | |||
=== Assignment 2 === | === Assignment 2 === | ||
=== Assignment 3 === | === Assignment 3 === |
Revision as of 15:57, 14 October 2015
Contents
Team Kappa
Team Members
- Ryan Mullings, Team Member
- Andy Cooc, Team Member
- Matt Jang, Team Member
Progress
Assignment 1
Program 1: RSA Encryption
Profiling
- gprof: Encryption 10MB
% cumulative self self total time seconds seconds calls s/call s/call name 60.48 1.50 1.50 3495254 0.00 0.00 fastModularExp(unsigned long long, unsigned long long, unsigned long long) 32.66 2.31 0.81 3495254 0.00 0.00 findHighestSetBit(unsigned long long) 4.44 2.42 0.11 1 0.11 2.48 encryption(std::string, std::string, std::string) 2.42 2.48 0.06 3495254 0.00 0.00 intToByteArray(int, char*) 0.00 2.48 0.00 1 0.00 0.00 _GLOBAL__sub_I__Z10encryptionSsSsSs 0.00 2.48 0.00 1 0.00 0.00 _GLOBAL__sub_I_main 0.00 2.48 0.00 1 0.00 0.00 __static_initialization_and_destruction_0(int, int) 0.00 2.48 0.00 1 0.00 0.00 __static_initialization_and_destruction_0(int, int)
- gprof: Encryption 20MB
% cumulative self self total time seconds seconds calls s/call s/call name 63.65 3.27 3.27 6990507 0.00 0.00 fastModularExp(unsigned long long, unsigned long long, unsigned long long) 29.04 4.75 1.49 6990507 0.00 0.00 findHighestSetBit(unsigned long long) 3.90 4.96 0.20 1 0.20 5.13 encryption(std::string, std::string, std::string) 3.41 5.13 0.17 6990507 0.00 0.00 intToByteArray(int, char*) 0.00 5.13 0.00 1 0.00 0.00 _GLOBAL__sub_I__Z10encryptionSsSsSs 0.00 5.13 0.00 1 0.00 0.00 _GLOBAL__sub_I_main 0.00 5.13 0.00 1 0.00 0.00 __static_initialization_and_destruction_0(int, int) 0.00 5.13 0.00 1 0.00 0.00 __static_initialization_and_destruction_0(int, int)
- gprof: Encryption 30MB
% cumulative self self total time seconds seconds calls s/call s/call name 68.04 5.11 5.11 10485760 0.00 0.00 fastModularExp(unsigned long long, unsigned long long, unsigned long long) 25.23 7.00 1.90 10485760 0.00 0.00 findHighestSetBit(unsigned long long) 3.99 7.30 0.30 1 0.30 7.51 encryption(std::string, std::string, std::string) 2.73 7.51 0.20 10485760 0.00 0.00 intToByteArray(int, char*) 0.00 7.51 0.00 1 0.00 0.00 _GLOBAL__sub_I__Z10encryptionSsSsSs 0.00 7.51 0.00 1 0.00 0.00 _GLOBAL__sub_I_main 0.00 7.51 0.00 1 0.00 0.00 __static_initialization_and_destruction_0(int, int) 0.00 7.51 0.00 1 0.00 0.00 __static_initialization_and_destruction_0(int, int)
- gprof: Decryption 10MB
% cumulative self self total time seconds seconds calls s/call s/call name 69.80 1.71 1.71 3495254 0.00 0.00 fastModularExp(unsigned long long, unsigned long long, unsigned long long) 21.63 2.24 0.53 3495254 0.00 0.00 findHighestSetBit(unsigned long long) 6.12 2.39 0.15 1 0.15 2.45 decryption(std::string, std::string, std::string) 2.45 2.45 0.06 3495254 0.00 0.00 intToByteArray(int, char*) 0.00 2.45 0.00 1 0.00 0.00 _GLOBAL__sub_I__Z10encryptionSsSsSs 0.00 2.45 0.00 1 0.00 0.00 _GLOBAL__sub_I_main 0.00 2.45 0.00 1 0.00 0.00 __static_initialization_and_destruction_0(int, int) 0.00 2.45 0.00 1 0.00 0.00 __static_initialization_and_destruction_0(int, int)
- gprof: Decryption 20MB
% cumulative self self total time seconds seconds calls s/call s/call name 64.11 3.38 3.38 6990507 0.00 0.00 fastModularExp(unsigned long long, unsigned long long, unsigned long long) 27.18 4.82 1.44 6990507 0.00 0.00 findHighestSetBit(unsigned long long) 4.73 5.07 0.25 1 0.25 5.28 decryption(std::string, std::string, std::string) 3.98 5.28 0.21 6990507 0.00 0.00 intToByteArray(int, char*) 0.00 5.28 0.00 1 0.00 0.00 _GLOBAL__sub_I__Z10encryptionSsSsSs 0.00 5.28 0.00 1 0.00 0.00 _GLOBAL__sub_I_main 0.00 5.28 0.00 1 0.00 0.00 __static_initialization_and_destruction_0(int, int) 0.00 5.28 0.00 1 0.00 0.00 __static_initialization_and_destruction_0(int, int)
- gprof: Decryption 30MB
% cumulative self self total time seconds seconds calls s/call s/call name 66.88 5.25 5.25 10485760 0.00 0.00 fastModularExp(unsigned long long, unsigned long long, unsigned long long) 24.59 7.18 1.93 10485760 0.00 0.00 findHighestSetBit(unsigned long long) 5.73 7.63 0.45 1 0.45 7.85 decryption(std::string, std::string, std::string) 2.80 7.85 0.22 10485760 0.00 0.00 intToByteArray(int, char*) 0.00 7.85 0.00 1 0.00 0.00 _GLOBAL__sub_I__Z10encryptionSsSsSs 0.00 7.85 0.00 1 0.00 0.00 _GLOBAL__sub_I_main 0.00 7.85 0.00 1 0.00 0.00 __static_initialization_and_destruction_0(int, int) 0.00 7.85 0.00 1 0.00 0.00 __static_initialization_and_destruction_0(int, int)
Bottleneck Code
The two slowest parts of this encryption method are fastModularExp and findHighestSetBit. A third function, intToByteArray, takes up a relatively small amount of time but may still be able to be optimized.
unsigned int fastModularExp(ULong a, ULong b, ULong c) { ULong result = 1; ULong leadingbit = findHighestSetBit(b); // Heighest set bit while(leadingbit > 0){ //while there are bits left result = ((result*result) % c); //case 1: bit is a 0 if((b & leadingbit) > 0){ result = ((result * a) % c); //case 2: if bit is a 1 } leadingbit = leadingbit >> 1; } return (unsigned int)result; }
ULong findHighestSetBit(ULong num){ ULong result = 0; for(int i = 63; i >= 0; i--){ if(num & (1ULL << i)){ result = 1ULL << i; return result; } } return result; }
byte* intToByteArray(int num, byte *result){ for(int i = 0; i < 4; i++){ result[i] = (num & (0xFF << (8 *(3-i)))) >> (8 *(3-i)); } return result; }
At first glance, the fastest function is also the function that appears like it would be the easiest to run on a GPU. The other code does not look like it can be optimized with a GPU easily as it does not use large arrays of N size. However, after some short internet research, several documents turned up on the topic of using a GPU to make RSA faster (example: here and here). This leaves me hopeful that in the further stages of this assignment that this could be an interesting program to work with.