Difference between revisions of "GPU610/Team AGC"
(→1-D Wave Equation) |
(→1-D Wave Equation) |
||
Line 237: | Line 237: | ||
[[Image:wave2.gif]] | [[Image:wave2.gif]] | ||
+ | |||
+ | |||
+ | The first step was to create a private repository on Bitbucket to avoid any plagerism issues with this course for next semester students, as well as provide code revision and protection in case my progress is lost or corrupt. | ||
+ | |||
+ | The next step is to convert the following C file into C++ code that will be compatible with CUDA. | ||
+ | |||
+ | mpi_wave.c | ||
+ | <pre> | ||
+ | /*************************************************************************** | ||
+ | * FILE: mpi_wave.c | ||
+ | * OTHER FILES: draw_wave.c | ||
+ | * DESCRIPTION: | ||
+ | * MPI Concurrent Wave Equation - C Version | ||
+ | * Point-to-Point Communications Example | ||
+ | * This program implements the concurrent wave equation described | ||
+ | * in Chapter 5 of Fox et al., 1988, Solving Problems on Concurrent | ||
+ | * Processors, vol 1. | ||
+ | * A vibrating string is decomposed into points. Each processor is | ||
+ | * responsible for updating the amplitude of a number of points over | ||
+ | * time. At each iteration, each processor exchanges boundary points with | ||
+ | * nearest neighbors. This version uses low level sends and receives | ||
+ | * to exchange boundary points. | ||
+ | * AUTHOR: Blaise Barney. Adapted from Ros Leibensperger, Cornell Theory | ||
+ | * Center. Converted to MPI: George L. Gusciora, MHPCC (1/95) | ||
+ | * LAST REVISED: 07/05/05 | ||
+ | ***************************************************************************/ | ||
+ | #include "mpi.h" | ||
+ | #include <stdio.h> | ||
+ | #include <stdlib.h> | ||
+ | #include <math.h> | ||
+ | |||
+ | #define MASTER 0 | ||
+ | #define TPOINTS 800 | ||
+ | #define MAXSTEPS 10000 | ||
+ | #define PI 3.14159265 | ||
+ | |||
+ | int RtoL = 10; | ||
+ | int LtoR = 20; | ||
+ | int OUT1 = 30; | ||
+ | int OUT2 = 40; | ||
+ | |||
+ | void init_master(void); | ||
+ | void init_workers(void); | ||
+ | void init_line(void); | ||
+ | void update (int left, int right); | ||
+ | void output_master(void); | ||
+ | void output_workers(void); | ||
+ | extern void draw_wave(double *); | ||
+ | |||
+ | int taskid, /* task ID */ | ||
+ | numtasks, /* number of processes */ | ||
+ | nsteps, /* number of time steps */ | ||
+ | npoints, /* number of points handled by this processor */ | ||
+ | first; /* index of 1st point handled by this processor */ | ||
+ | double etime, /* elapsed time in seconds */ | ||
+ | values[TPOINTS+2], /* values at time t */ | ||
+ | oldval[TPOINTS+2], /* values at time (t-dt) */ | ||
+ | newval[TPOINTS+2]; /* values at time (t+dt) */ | ||
+ | |||
+ | /* ------------------------------------------------------------------------ | ||
+ | * Master obtains timestep input value from user and broadcasts it | ||
+ | * ------------------------------------------------------------------------ */ | ||
+ | void init_master(void) { | ||
+ | char tchar[8]; | ||
+ | |||
+ | |||
+ | /* Set number of number of time steps and then print and broadcast*/ | ||
+ | nsteps = 0; | ||
+ | while ((nsteps < 1) || (nsteps > MAXSTEPS)) { | ||
+ | printf("Enter number of time steps (1-%d): \n",MAXSTEPS); | ||
+ | scanf("%s", tchar); | ||
+ | nsteps = atoi(tchar); | ||
+ | if ((nsteps < 1) || (nsteps > MAXSTEPS)) | ||
+ | printf("Enter value between 1 and %d\n", MAXSTEPS); | ||
+ | } | ||
+ | MPI_Bcast(&nsteps, 1, MPI_INT, MASTER, MPI_COMM_WORLD); | ||
+ | } | ||
+ | |||
+ | /* ------------------------------------------------------------------------- | ||
+ | * Workers receive timestep input value from master | ||
+ | * -------------------------------------------------------------------------*/ | ||
+ | void init_workers(void) { | ||
+ | MPI_Bcast(&nsteps, 1, MPI_INT, MASTER, MPI_COMM_WORLD); | ||
+ | } | ||
+ | |||
+ | /* ------------------------------------------------------------------------ | ||
+ | * All processes initialize points on line | ||
+ | * --------------------------------------------------------------------- */ | ||
+ | void init_line(void) { | ||
+ | int nmin, nleft, npts, i, j, k; | ||
+ | double x, fac; | ||
+ | |||
+ | /* calculate initial values based on sine curve */ | ||
+ | nmin = TPOINTS/numtasks; | ||
+ | nleft = TPOINTS%numtasks; | ||
+ | fac = 2.0 * PI; | ||
+ | for (i = 0, k = 0; i < numtasks; i++) { | ||
+ | npts = (i < nleft) ? nmin + 1 : nmin; | ||
+ | if (taskid == i) { | ||
+ | first = k + 1; | ||
+ | npoints = npts; | ||
+ | printf ("task=%3d first point=%5d npoints=%4d\n", taskid, | ||
+ | first, npts); | ||
+ | for (j = 1; j <= npts; j++, k++) { | ||
+ | x = (double)k/(double)(TPOINTS - 1); | ||
+ | values[j] = sin (fac * x); | ||
+ | } | ||
+ | } | ||
+ | else k += npts; | ||
+ | } | ||
+ | for (i = 1; i <= npoints; i++) | ||
+ | oldval[i] = values[i]; | ||
+ | } | ||
+ | |||
+ | /* ------------------------------------------------------------------------- | ||
+ | * All processes update their points a specified number of times | ||
+ | * -------------------------------------------------------------------------*/ | ||
+ | void update(int left, int right) { | ||
+ | int i, j; | ||
+ | double dtime, c, dx, tau, sqtau; | ||
+ | MPI_Status status; | ||
+ | |||
+ | dtime = 0.3; | ||
+ | c = 1.0; | ||
+ | dx = 1.0; | ||
+ | tau = (c * dtime / dx); | ||
+ | sqtau = tau * tau; | ||
+ | |||
+ | /* Update values for each point along string */ | ||
+ | for (i = 1; i <= nsteps; i++) { | ||
+ | /* Exchange data with "left-hand" neighbor */ | ||
+ | if (first != 1) { | ||
+ | MPI_Send(&values[1], 1, MPI_DOUBLE, left, RtoL, MPI_COMM_WORLD); | ||
+ | MPI_Recv(&values[0], 1, MPI_DOUBLE, left, LtoR, MPI_COMM_WORLD, | ||
+ | &status); | ||
+ | } | ||
+ | /* Exchange data with "right-hand" neighbor */ | ||
+ | if (first + npoints -1 != TPOINTS) { | ||
+ | MPI_Send(&values[npoints], 1, MPI_DOUBLE, right, LtoR, MPI_COMM_WORLD); | ||
+ | MPI_Recv(&values[npoints+1], 1, MPI_DOUBLE, right, RtoL, | ||
+ | MPI_COMM_WORLD, &status); | ||
+ | } | ||
+ | /* Update points along line */ | ||
+ | for (j = 1; j <= npoints; j++) { | ||
+ | /* Global endpoints */ | ||
+ | if ((first + j - 1 == 1) || (first + j - 1 == TPOINTS)) | ||
+ | newval[j] = 0.0; | ||
+ | else | ||
+ | /* Use wave equation to update points */ | ||
+ | newval[j] = (2.0 * values[j]) - oldval[j] | ||
+ | + (sqtau * (values[j-1] - (2.0 * values[j]) + values[j+1])); | ||
+ | } | ||
+ | for (j = 1; j <= npoints; j++) { | ||
+ | oldval[j] = values[j]; | ||
+ | values[j] = newval[j]; | ||
+ | } | ||
+ | } | ||
+ | } | ||
+ | |||
+ | /* ------------------------------------------------------------------------ | ||
+ | * Master receives results from workers and prints | ||
+ | * ------------------------------------------------------------------------ */ | ||
+ | void output_master(void) { | ||
+ | int i, j, source, start, npts, buffer[2]; | ||
+ | double results[TPOINTS]; | ||
+ | MPI_Status status; | ||
+ | |||
+ | /* Store worker's results in results array */ | ||
+ | for (i = 1; i < numtasks; i++) { | ||
+ | /* Receive first point, number of points and results */ | ||
+ | MPI_Recv(buffer, 2, MPI_INT, i, OUT1, MPI_COMM_WORLD, &status); | ||
+ | start = buffer[0]; | ||
+ | npts = buffer[1]; | ||
+ | MPI_Recv(&results[start-1], npts, MPI_DOUBLE, i, OUT2, | ||
+ | MPI_COMM_WORLD, &status); | ||
+ | } | ||
+ | |||
+ | /* Store master's results in results array */ | ||
+ | for (i = first; i < first + npoints; i++) | ||
+ | results[i-1] = values[i]; | ||
+ | |||
+ | j = 0; | ||
+ | printf("***************************************************************\n"); | ||
+ | printf("Final amplitude values for all points after %d steps:\n",nsteps); | ||
+ | for (i = 0; i < TPOINTS; i++) { | ||
+ | printf("%6.2f ", results[i]); | ||
+ | j = j++; | ||
+ | if (j == 10) { | ||
+ | printf("\n"); | ||
+ | j = 0; | ||
+ | } | ||
+ | } | ||
+ | printf("***************************************************************\n"); | ||
+ | printf("\nDrawing graph...\n"); | ||
+ | printf("Click the EXIT button or use CTRL-C to quit\n"); | ||
+ | |||
+ | /* display results with draw_wave routine */ | ||
+ | draw_wave(&results[0]); | ||
+ | } | ||
+ | |||
+ | /* ------------------------------------------------------------------------- | ||
+ | * Workers send the updated values to the master | ||
+ | * -------------------------------------------------------------------------*/ | ||
+ | |||
+ | void output_workers(void) { | ||
+ | int buffer[2]; | ||
+ | MPI_Status status; | ||
+ | |||
+ | /* Send first point, number of points and results to master */ | ||
+ | buffer[0] = first; | ||
+ | buffer[1] = npoints; | ||
+ | MPI_Send(&buffer, 2, MPI_INT, MASTER, OUT1, MPI_COMM_WORLD); | ||
+ | MPI_Send(&values[1], npoints, MPI_DOUBLE, MASTER, OUT2, MPI_COMM_WORLD); | ||
+ | } | ||
+ | |||
+ | /* ------------------------------------------------------------------------ | ||
+ | * Main program | ||
+ | * ------------------------------------------------------------------------ */ | ||
+ | |||
+ | int main (int argc, char *argv[]) | ||
+ | { | ||
+ | int left, right, rc; | ||
+ | |||
+ | /* Initialize MPI */ | ||
+ | MPI_Init(&argc,&argv); | ||
+ | MPI_Comm_rank(MPI_COMM_WORLD,&taskid); | ||
+ | MPI_Comm_size(MPI_COMM_WORLD,&numtasks); | ||
+ | if (numtasks < 2) { | ||
+ | printf("ERROR: Number of MPI tasks set to %d\n",numtasks); | ||
+ | printf("Need at least 2 tasks! Quitting...\n"); | ||
+ | MPI_Abort(MPI_COMM_WORLD, rc); | ||
+ | exit(0); | ||
+ | } | ||
+ | |||
+ | /* Determine left and right neighbors */ | ||
+ | if (taskid == numtasks-1) | ||
+ | right = 0; | ||
+ | else | ||
+ | right = taskid + 1; | ||
+ | |||
+ | if (taskid == 0) | ||
+ | left = numtasks - 1; | ||
+ | else | ||
+ | left = taskid - 1; | ||
+ | |||
+ | /* Get program parameters and initialize wave values */ | ||
+ | if (taskid == MASTER) { | ||
+ | printf ("Starting mpi_wave using %d tasks.\n", numtasks); | ||
+ | printf ("Using %d points on the vibrating string.\n", TPOINTS); | ||
+ | init_master(); | ||
+ | } | ||
+ | else | ||
+ | init_workers(); | ||
+ | |||
+ | init_line(); | ||
+ | |||
+ | /* Update values along the line for nstep time steps */ | ||
+ | update(left, right); | ||
+ | |||
+ | /* Master collects results from workers and prints */ | ||
+ | if (taskid == MASTER) | ||
+ | output_master(); | ||
+ | else | ||
+ | output_workers(); | ||
+ | |||
+ | MPI_Finalize(); | ||
+ | return 0; | ||
+ | } | ||
+ | </pre> |
Revision as of 21:36, 24 November 2014
GPU610/DPS915 | Student List | Group and Project Index | Student Resources | Glossary
Team AGC
Team Members
- Andy Cooc, Some responsibility
- Gabriel Castro, Some other responsibility
- Christopher Markieta, Some other responsibility
Progress
Assignment 1
Andy's Findings
...
Gabriel's Findings
Pillow Library (Python)
The Pillow library is an image manipulation library for python. It has many common functions like re-sizing, cropping, rotating, blur, sharpen etc. It's available on github at https://github.com/python-pillow/Pillow
I used 4 filters for testing
filter | run time |
---|---|
Blur | 1.305s |
Gaussian Blur | 4.492 |
Auto Contrast | 0.092 |
Grey Scale | 0.046 |
See the code on github.
Gaussian Blur
A gaussian blur is a filter that can be applied to an image to make it blurry, in its simplest form it goes through every pixel in an image and averages it with its surrounding pixels. In the Pillow library it is implemented in a C module. This helps with processing time as C is hundreds of times faster than python. A quick look at the source shows that algorithm contains a nested for loop that's O(x * y) and then one that's O(x * y * r), that means that time grows n^2 based on the size of the image and n^3 based on the size of the blur radius.
Conclusion
With many nested operations, this a really good candidate for GPU paralization.
Christopher's Findings
xor_me
xor_me is an open source, brute force password cracker for doc and xls files.
xor_me was written by Benoît Sibaud and is copyrighted under the terms of the GNU Lesser General Public License version 3 only, as published by the Free Software Foundation. Here is a link to my fork of this repository:
https://github.com/Markieta/xor_me
Here is the profile of a short run of brute_force:
Flat profile: Each sample counts as 0.01 seconds. % cumulative self self total time seconds seconds calls Ts/call Ts/call name 99.35 10.58 10.58 lclGetKey(unsigned char const*, long) 0.19 10.60 0.02 dump_exit(int) 0.00 10.60 0.00 1 0.00 0.00 _GLOBAL__sub_I__Z9lclGetKeyPKhl
As you can see, lclGetKey is the function where I will spend most of my time trying to parallelize code.
This application is quite CPU intensive, but hasn't been optimized for multiple threads. It is using 100% of 1 thread from the 8 available on my CPU. See the list of processes below for an example.
top - 23:25:32 up 1:16, 3 users, load average: 0.94, 0.65, 0.60 Tasks: 256 total, 2 running, 253 sleeping, 0 stopped, 1 zombie %Cpu(s): 13.7 us, 0.7 sy, 0.0 ni, 85.5 id, 0.0 wa, 0.0 hi, 0.0 si, 0.0 st KiB Mem: 3979012 total, 3270196 used, 708816 free, 352592 buffers KiB Swap: 4124668 total, 0 used, 4124668 free. 984048 cached Mem PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND 5229 markieta 20 0 12664 1064 896 R 100.0 0.0 1:06.04 brute_force 1467 root 20 0 538620 208272 166664 S 3.7 5.2 6:55.85 Xorg 2540 markieta 20 0 1084328 169500 53460 S 3.3 4.3 11:57.88 chrome 2075 markieta 9 -11 509016 6840 4500 S 2.7 0.2 3:21.47 pulseaudio 3022 markieta 20 0 1094124 79252 10220 S 2.3 2.0 3:05.77 chrome 2932 markieta 20 0 829212 43176 19848 S 1.7 1.1 0:31.29 /usr/bin/termin 2989 markieta 20 0 1039100 117292 23968 S 1.7 2.9 1:48.53 chrome 2296 markieta 20 0 1553220 170248 65592 S 1.0 4.3 3:16.43 compiz 2789 markieta 20 0 1064160 99040 20116 S 0.7 2.5 1:24.61 chrome 2993 markieta 20 0 1112596 155316 26192 S 0.7 3.9 1:31.43 chrome 3482 root 20 0 0 0 0 S 0.3 0.0 0:07.34 kworker/2:2 4364 root 20 0 0 0 0 S 0.3 0.0 0:00.96 kworker/0:0 5230 markieta 20 0 24952 1776 1176 R 0.3 0.0 0:00.14 top
Here I ran the brute_force function again to see how long it would really take. I did not run this command for too long, hence the the termination ^C
after about 2 minutes.
markieta@Y560:~/Documents/xor_me$ time ./brute_force 0x499a 0xcc61 Key: 499a Hash: cc61 Password: '1950' Password: 'w2#1' ^CState: 20:22:48:46:7e:48:62:c3c0:00000184023807d008a011c010800000:6c62477d45472100 real 2m16.888s user 2m16.082s sys 0m0.640s
I am hoping to see a lot of improvement after my research and implementation with CUDA and OpenCL.
Team AGC has decided to collaborate on this project for assignment 2 onwards.
Assignment 2
Christopher's Findings
xor_me
This project has been dropped due to the complexity of the code and difficulty with respect to this assignment.
Since the majority of CPU cycles is spent in the most inner for-loop, the goal is to parallelize this code only. It is possible to parallelize all 8 for-loops (Maximum of 8 character password) because they are performing very similar tasks. However, due to the time constraint and purpose of this assignment, I will only focus on this one. See below for the following code to be optimized for our GPU:
for (o=32; o < 128; ++o) { skipInits: unsigned short x = nHash ^ hash; lclRotateRight(x, 1); if (32 <= x && x < 127) { t[0] = static_cast<unsigned char>(x); if (nKey == lclGetKey(t, 16)) { std::cout << "Password: '" << t << "'" << std::endl; } } hash ^= r[1]; r[1] = t[1] = o; lclRotateLeft(r[1], 2); hash ^= r[1]; if (o == 32) { r[0] = '\0'; hash = lclGetHash(t, r, 16); } }
My fork for this application can be found here: https://github.com/Markieta/xor_me
At first it seems impossible to do because of the data dependencies: hash, r[1], t[1], and o. But after doing some research, I have learned that the XOR operation is communicative and associative, meaning that we can simply calculate all of the dependencies in parallel first, then combine the results to achieve the same data.
Here is a non-formal proof to help make my point clear. The ability to achieve the same result vertically and horizontally will be the key to solving this task.
Step 1: 101 ^ 001 ^ 111 = 001 ^ ^ ^ 111 ^ 100 ^ 001 = 010 ^ ^ ^ 100 ^ 010 ^ 001 = 111 || || || 110 111 111 Step 2: Operate with vertical and horizontal 001 ^ 010 ^ 111 = 110 110 ^ 111 ^ 111 = 110
The first iteration of this loop executes a special condition which we must deal with before anything else:
if (o == 32) { r[0] = '\0'; hash = lclGetHash(t, r, 16); }
Although r[1] and t[1] are equivalent for some time during the loop, we will keep them on separate arrays for simplicity (lclRotateLeft takes a reference to r at index 1):
r[1] = t[1] = o; lclRotateLeft(r[1], 2);
After struggling for a few hours I realized that I should be using Thrust instead:
// unsigned short* h_r = new unsigned short[LOOPSIZE * RT_SIZE]; // unsigned char* h_t = new unsigned char[LOOPSIZE * RT_SIZE]; // unsigned char* h_x = new unsigned char[LOOPSIZE]; thrust::host_vector<unsigned short> h_r(LOOPSIZE * RT_SIZE); thrust::host_vector<unsigned short> h_hash(LOOPSIZE); thrust::host_vector<unsigned char> h_t(LOOPSIZE * RT_SIZE); thrust::host_vector<unsigned char> h_x(LOOPSIZE); // unsigned short* d_r; // unsigned char* d_t; // unsigned char* d_x; // cudaMalloc((void**)&d_r, LOOPSIZE * RT_SIZE); // cudaMalloc((void**)&d_t, LOOPSIZE * RT_SIZE); // cudaMalloc((void**)&d_x, LOOPSIZE);
The following transformation allows the computation of all XOR operations needed in this phase:
thrust::device_vector<unsigned short> d_r = h_r; thrust::device_vector<unsigned short> d_hash = h_hash; thrust::device_vector<unsigned char> d_t = h_t; thrust::device_vector<unsigned char> d_x = h_x; thrust::device_vector<unsigned short> xor_hash(LOOPSIZE); // XOR results // For XOR operations with r[1] // thrust::device_ptr<unsigned short> pHash = &d_hash[0]; // thrust::device_reference<unsigned short> rHash(pHash); thrust::transform(d_hash.begin(), d_hash.end(), d_r.begin(), xor_hash.begin(), thrust::bit_xor<int>());
Now to check each result with x and t[1] if the password has been found, otherwise combine the XOR results by XOR'ing them together for the next iteration to use.
Assignment 3
Christopher's Findings
Due to the difficulty of Assignment 2, I have chosen a new project to parallelize.
1-D Wave Equation
The first step was to create a private repository on Bitbucket to avoid any plagerism issues with this course for next semester students, as well as provide code revision and protection in case my progress is lost or corrupt.
The next step is to convert the following C file into C++ code that will be compatible with CUDA.
mpi_wave.c
/*************************************************************************** * FILE: mpi_wave.c * OTHER FILES: draw_wave.c * DESCRIPTION: * MPI Concurrent Wave Equation - C Version * Point-to-Point Communications Example * This program implements the concurrent wave equation described * in Chapter 5 of Fox et al., 1988, Solving Problems on Concurrent * Processors, vol 1. * A vibrating string is decomposed into points. Each processor is * responsible for updating the amplitude of a number of points over * time. At each iteration, each processor exchanges boundary points with * nearest neighbors. This version uses low level sends and receives * to exchange boundary points. * AUTHOR: Blaise Barney. Adapted from Ros Leibensperger, Cornell Theory * Center. Converted to MPI: George L. Gusciora, MHPCC (1/95) * LAST REVISED: 07/05/05 ***************************************************************************/ #include "mpi.h" #include <stdio.h> #include <stdlib.h> #include <math.h> #define MASTER 0 #define TPOINTS 800 #define MAXSTEPS 10000 #define PI 3.14159265 int RtoL = 10; int LtoR = 20; int OUT1 = 30; int OUT2 = 40; void init_master(void); void init_workers(void); void init_line(void); void update (int left, int right); void output_master(void); void output_workers(void); extern void draw_wave(double *); int taskid, /* task ID */ numtasks, /* number of processes */ nsteps, /* number of time steps */ npoints, /* number of points handled by this processor */ first; /* index of 1st point handled by this processor */ double etime, /* elapsed time in seconds */ values[TPOINTS+2], /* values at time t */ oldval[TPOINTS+2], /* values at time (t-dt) */ newval[TPOINTS+2]; /* values at time (t+dt) */ /* ------------------------------------------------------------------------ * Master obtains timestep input value from user and broadcasts it * ------------------------------------------------------------------------ */ void init_master(void) { char tchar[8]; /* Set number of number of time steps and then print and broadcast*/ nsteps = 0; while ((nsteps < 1) || (nsteps > MAXSTEPS)) { printf("Enter number of time steps (1-%d): \n",MAXSTEPS); scanf("%s", tchar); nsteps = atoi(tchar); if ((nsteps < 1) || (nsteps > MAXSTEPS)) printf("Enter value between 1 and %d\n", MAXSTEPS); } MPI_Bcast(&nsteps, 1, MPI_INT, MASTER, MPI_COMM_WORLD); } /* ------------------------------------------------------------------------- * Workers receive timestep input value from master * -------------------------------------------------------------------------*/ void init_workers(void) { MPI_Bcast(&nsteps, 1, MPI_INT, MASTER, MPI_COMM_WORLD); } /* ------------------------------------------------------------------------ * All processes initialize points on line * --------------------------------------------------------------------- */ void init_line(void) { int nmin, nleft, npts, i, j, k; double x, fac; /* calculate initial values based on sine curve */ nmin = TPOINTS/numtasks; nleft = TPOINTS%numtasks; fac = 2.0 * PI; for (i = 0, k = 0; i < numtasks; i++) { npts = (i < nleft) ? nmin + 1 : nmin; if (taskid == i) { first = k + 1; npoints = npts; printf ("task=%3d first point=%5d npoints=%4d\n", taskid, first, npts); for (j = 1; j <= npts; j++, k++) { x = (double)k/(double)(TPOINTS - 1); values[j] = sin (fac * x); } } else k += npts; } for (i = 1; i <= npoints; i++) oldval[i] = values[i]; } /* ------------------------------------------------------------------------- * All processes update their points a specified number of times * -------------------------------------------------------------------------*/ void update(int left, int right) { int i, j; double dtime, c, dx, tau, sqtau; MPI_Status status; dtime = 0.3; c = 1.0; dx = 1.0; tau = (c * dtime / dx); sqtau = tau * tau; /* Update values for each point along string */ for (i = 1; i <= nsteps; i++) { /* Exchange data with "left-hand" neighbor */ if (first != 1) { MPI_Send(&values[1], 1, MPI_DOUBLE, left, RtoL, MPI_COMM_WORLD); MPI_Recv(&values[0], 1, MPI_DOUBLE, left, LtoR, MPI_COMM_WORLD, &status); } /* Exchange data with "right-hand" neighbor */ if (first + npoints -1 != TPOINTS) { MPI_Send(&values[npoints], 1, MPI_DOUBLE, right, LtoR, MPI_COMM_WORLD); MPI_Recv(&values[npoints+1], 1, MPI_DOUBLE, right, RtoL, MPI_COMM_WORLD, &status); } /* Update points along line */ for (j = 1; j <= npoints; j++) { /* Global endpoints */ if ((first + j - 1 == 1) || (first + j - 1 == TPOINTS)) newval[j] = 0.0; else /* Use wave equation to update points */ newval[j] = (2.0 * values[j]) - oldval[j] + (sqtau * (values[j-1] - (2.0 * values[j]) + values[j+1])); } for (j = 1; j <= npoints; j++) { oldval[j] = values[j]; values[j] = newval[j]; } } } /* ------------------------------------------------------------------------ * Master receives results from workers and prints * ------------------------------------------------------------------------ */ void output_master(void) { int i, j, source, start, npts, buffer[2]; double results[TPOINTS]; MPI_Status status; /* Store worker's results in results array */ for (i = 1; i < numtasks; i++) { /* Receive first point, number of points and results */ MPI_Recv(buffer, 2, MPI_INT, i, OUT1, MPI_COMM_WORLD, &status); start = buffer[0]; npts = buffer[1]; MPI_Recv(&results[start-1], npts, MPI_DOUBLE, i, OUT2, MPI_COMM_WORLD, &status); } /* Store master's results in results array */ for (i = first; i < first + npoints; i++) results[i-1] = values[i]; j = 0; printf("***************************************************************\n"); printf("Final amplitude values for all points after %d steps:\n",nsteps); for (i = 0; i < TPOINTS; i++) { printf("%6.2f ", results[i]); j = j++; if (j == 10) { printf("\n"); j = 0; } } printf("***************************************************************\n"); printf("\nDrawing graph...\n"); printf("Click the EXIT button or use CTRL-C to quit\n"); /* display results with draw_wave routine */ draw_wave(&results[0]); } /* ------------------------------------------------------------------------- * Workers send the updated values to the master * -------------------------------------------------------------------------*/ void output_workers(void) { int buffer[2]; MPI_Status status; /* Send first point, number of points and results to master */ buffer[0] = first; buffer[1] = npoints; MPI_Send(&buffer, 2, MPI_INT, MASTER, OUT1, MPI_COMM_WORLD); MPI_Send(&values[1], npoints, MPI_DOUBLE, MASTER, OUT2, MPI_COMM_WORLD); } /* ------------------------------------------------------------------------ * Main program * ------------------------------------------------------------------------ */ int main (int argc, char *argv[]) { int left, right, rc; /* Initialize MPI */ MPI_Init(&argc,&argv); MPI_Comm_rank(MPI_COMM_WORLD,&taskid); MPI_Comm_size(MPI_COMM_WORLD,&numtasks); if (numtasks < 2) { printf("ERROR: Number of MPI tasks set to %d\n",numtasks); printf("Need at least 2 tasks! Quitting...\n"); MPI_Abort(MPI_COMM_WORLD, rc); exit(0); } /* Determine left and right neighbors */ if (taskid == numtasks-1) right = 0; else right = taskid + 1; if (taskid == 0) left = numtasks - 1; else left = taskid - 1; /* Get program parameters and initialize wave values */ if (taskid == MASTER) { printf ("Starting mpi_wave using %d tasks.\n", numtasks); printf ("Using %d points on the vibrating string.\n", TPOINTS); init_master(); } else init_workers(); init_line(); /* Update values along the line for nstep time steps */ update(left, right); /* Master collects results from workers and prints */ if (taskid == MASTER) output_master(); else output_workers(); MPI_Finalize(); return 0; }