c - Parallel slower than serial -
issue
hello everyone, have got program (from net) intend speed converting parallel version use of pthreads
. surprisingly though, runs slower serial version. below program:
# include <stdio.h> //fast square root algorithm double asmsqrt(double x) { __asm__ ("fsqrt" : "+t" (x)); return x; } //test if number prime bool isprime(int n) { if (n <= 1) return false; if (n == 2) return true; if (n%2 == 0) return false; int sqrtn,i; sqrtn = asmsqrt(n); (i = 3; <= sqrtn; i+=2) if (n%i == 0) return false; return true; } //number generator iterated 0 n int main() { n = 1000000; //maximum number int k,j; (j = 0; j<= n; j++) { if(isprime(j) == 1) k++; if(j == n) printf("count: %d\n",k); } return 0; }
first attempt parallelization
i let pthread
manage for loop
# include <stdio.h> . . int main() { . . //----->pthread code here<---- (j = 0; j<= n; j++) { if(isprime(j) == 1) k++; if(j == n) printf("count: %d\n",k); } return 0; }
well, runs slower serial one
second attempt
i divided for loop
two threads , run them in parallel using pthreads
however, still runs slower, intending may run twice fast or faster. not!
these parallel code way:
# include <stdio.h> # include <pthread.h> # include <cmath> # define nthreads 2 pthread_mutex_t mutex1 = pthread_mutex_initializer; int k = 0; double asmsqrt(double x) { __asm__ ("fsqrt" : "+t" (x)); return x; } struct arg_struct { int initialprime; int nextprime; }; bool isprime(int n) { if (n <= 1) return false; if (n == 2) return true; if (n%2 == 0) return false; int sqrtn,i; sqrtn = asmsqrt(n); (i = 3; <= sqrtn; i+=2) if (n%i == 0) return false; return true; } void *parallel_launcher(void *arguments) { struct arg_struct *args = (struct arg_struct *)arguments; int j = args -> initialprime; int n = args -> nextprime - 1; (j = 0; j<= n; j++) { if(isprime(j) == 1) { printf("this prime: %d\n",j); pthread_mutex_lock( &mutex1 ); k++; pthread_mutex_unlock( &mutex1 ); } if(j == n) printf("count: %d\n",k); } pthread_exit(null); } int main() { int f = 100000000; int m; pthread_t thread_id[nthreads]; struct arg_struct args; int rem = (f+1)%nthreads; int n = floor((f+1)/nthreads); for(int h = 0; h < nthreads; h++) { if(rem > 0) { m = n + 1; rem-= 1; } else if(rem == 0) { m = n; } args.initialprime = args.nextprime; args.nextprime = args.initialprime + m; pthread_create(&thread_id[h], null, ¶llel_launcher, (void *)&args); pthread_join(thread_id[h], null); } // printf("count: %d\n",k); return 0; }
note: os: fedora 21 x86_64, compiler: gcc-4.4, processor: intel core i5 (2 physical core, 4 logical), mem: 6 gb, hdd: 340 gb,
you need split range examining primes n parts, n number of threads.
the code each thread runs becomes:
typedef struct start_end { int start; int end; } start_end_t; int find_primes_in_range(void *in) { start_end_t *start_end = (start_end_t *) in; int num_primes = 0; (int j = start_end->start; j <= start_end->end; j++) { if (isprime(j) == 1) num_primes++; } pthread_exit((void *) num_primes; }
the main
routine first starts threads call find_primes_in_range
, calls pthread_join
each thread. sums values returned find_primes_in_range
. avoids locking , unlocking shared count variable.
this parallelize work, amount of work per thread not equal. can addressed more complicated.
Comments
Post a Comment