Monday, June 22, 2009

Multithreading other sort routines

With all the infrastructure for writing multithreaded sorts in place (see my last few posts), it's not difficult to write multithreaded versions of other sorting algorithms. This post covers multithreaded versions of combsort and shellsort. I'll warn in advance that... they suck. I've graphed them running on one, two, three and four cores, against one-core (and sometimes two-core) classic Quicksort. They don't do very well at all.

It's easy to see how to parallelize shellsort and combsort. Both algorithms use a series of "gaps" (or "intervals"). For each gap g, shellsort fully sorts each "slice" of elements whose indexes are separated by g, while combsort only partially sorts each slice. For a given gap g, each of the g separate slices can be processed independently, so each worker thread can be handed its own batch of slices. But the results are slow, and the examples I provide here could certainly be tuned better.
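
To make that concrete, here is a minimal sketch (plain C++, no dispatcher; the helper name is mine, purely for illustration) of how the g slices for one gap can be dealt out to the workers. The real code further down does exactly this arithmetic, just wrapped in the dispatcher machinery:

//sketch only: for a gap g, slice c is the elements at indexes c, c+g, c+2g, ...
//worker p gets the slices numbered firstChain through lastChain; the slices
//never overlap, so the workers need no locking
void dealOutSlices(int g, int processorCount)    //hypothetical helper
{
  for (int p=0; p<processorCount; ++p)
  {
    int firstChain = ( p    * g) / processorCount;
    int lastChain  = ((p+1) * g) / processorCount - 1;
    //...queue work on slices [firstChain..lastChain] for worker p...
    //(when g < processorCount, some workers simply get an empty range)
  }
}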

You can see how they might be tuned better by looking at, for example, the fourth graph, which breaks down the time spent in each combsort pass. It's clear from that graph that an insertion sort should be used much sooner, say once the gap drops to 17. But that wouldn't really improve things much (my own tests suggest an improvement of about 17%, sorting 9 million integers on 4 cores). Shellsort and combsort just do not parallelize well in practice, shellsort in particular.
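
If you wanted that tuning, it is only a line or two; something like this (a sketch only) in the gap loop of the parallelCombSort shown further down, with 17 being the cutoff the graph suggests rather than anything carefully tuned:

  for (;f>=0;f--)
  {
    int g = gaps[f];
    if (g<17)      //assumed cutoff: stop the comb passes early
      break;       //and let parallelInsertionSort finish the job
    //...queue the comb passes for gap g, exactly as before...
  }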

There are two versions of combsort shown here, because stock-standard combsort has truly awful performance on average (and a very bad worst case). A slightly different version, which alternates between left-to-right and right-to-left scans, has a markedly better average case (and a far better worst case).
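
For reference, a plain single-threaded version of that alternating idea looks something like this. It's a sketch only (the function name is mine); it uses the alternatingGaps table defined further down and finishes with an insertion sort, the same way the parallel versions do:

template <class T> void alternatingCombSort(T *base, int elementCount)
{
  int f;
  for (f=0; alternatingGaps[f]<elementCount; f++);
  for (--f; f>=0; f--)
  {
    int g = alternatingGaps[f];
    if ((f&1)==0)
    {
      for (int j=g; j<elementCount; ++j)       //left-to-right pass
        if (base[j]<base[j-g])
          { T v=base[j]; base[j]=base[j-g]; base[j-g]=v; }
    }
    else
    {
      for (int j=elementCount-1; j>=g; --j)    //right-to-left pass
        if (base[j]<base[j-g])
          { T v=base[j]; base[j]=base[j-g]; base[j-g]=v; }
    }
  }
  insertionSort(base, elementCount);           //the array is nearly sorted by now
}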

I'll start with a parallel insertion sort (!), because that's how you finish off a parallel combsort or a parallel shell sort. This implementation is only worth calling if the array is "nearly" sorted, as is the case "at the tail end" of a shellSort or a combSort.

template <class T> void parallelInsertionSort(T *base, int elementCount, int processorCount,
                                              int multiThreadingThreshold=50000, INotifiable *notifyMe=NULL)
{
  CountDown task;
  if (elementCount<processorCount*64 || processorCount<2)
  {
    insertionSort(base, elementCount);
  }
  else
  {
    task += processorCount;
    //sort processorCount roughly equal blocks, one queued task per processor
    for (int p=0;p<processorCount;++p)
    {
      int i = (elementCount * p     ) / processorCount;
      int j = (elementCount * (p+1) ) / processorCount;
      gpDispatcher->queueOrDo( NewNotifyingAction(CallAction2(insertionSort<T>,base+i,j-i), &task ) );
    }
    task.wait();
    for (int p=1;p<processorCount;++p)
    {
      int i = (elementCount * p     ) / processorCount;
      int k = (elementCount * (p+1) ) / processorCount;
      T v;
      //insert the elements in the range [i] through [k-1]
      //into the range [0] through [i-1], given that the
      //two ranges, [0..(i-1)] and [i..(k-1)] are 
      //independently sorted.
      for (int j=i;j<k;++j)
      {
        v = base[j];
        int h;
        for (h=j;h>0;--h)
          if (v<base[h-1])
            base[h] = base[h-1];
          else
            break;
        if (h==j)
        {
          //as soon as the element that *was* at [i-1]
          //is placed correctly, it is not necessary to
          //look at [(i+1)..(k-1)].  They're placed
          //where they should be already.
          break;
        }
        base[h] = v;
      }
    }    
  }
  if (notifyMe!=NULL)
    notifyMe->notify();
}

Now for the parallel shellsort:

template <class T> void shellSortSlices(T *base, int elementCount, int step, int firstChain, int lastChain)
{
  //does the gap-"step" insertion sorting of shellsort, but only for the slices
  //numbered firstChain through lastChain; the positions are visited a "row" at
  //a time, so the memory access pattern stays roughly sequential
  for (int i=firstChain+step;i<elementCount;i+=step)
  {
    int k=i + lastChain - firstChain;
    if (elementCount<=k) 
    {
      k=elementCount-1;
    }
    for (int j=i;j<=k;++j)
    {
      int h;
      T v = base[j];
      for (h=j;h>=step;h-=step)
        if (v<base[h-step]) 
          base[h]=base[h-step];
        else
          break;
      base[h] = v;
    }
  }
}

template <class T>   void parallelShellSort(T *base, int elementCount, int processorCount, int multiThreadingThreshold)
{      
  int gaps[] = { 5,13,37,97,251,631,1579,3947,9871,24677,61703,154267,385709,964283,2410711,
          6026827,15067067,37667713,94169297,235423249,588558169,1471395493 };
  //each gap is roughly 2.5 times the previous one
  int f;
  for (f=0; gaps[f]<elementCount; f++);
  f--;    //start from the largest gap that is smaller than elementCount
  CountDown task;
  for (;f>=0;f--)
  {
    int g = gaps[f];
    task += processorCount;
    for (int p=0; p<processorCount; p++)
    {
      int i =  p*g / processorCount;
      int j = (p+1)*g / processorCount - 1;
      gpDispatcher->queueOrDo ( NewNotifyingAction(CallAction5(shellSortSlices<T>, base, elementCount, g, i, j), &task));
    }        
    task.wait();
  }
  parallelInsertionSort(base, elementCount, processorCount, multiThreadingThreshold);  
}

And the parallel combsort:


int alternatingGaps[] = {2,3,5,7,11,17,23,37,53,79,113,163,229,331,463,653,
  919,1289,1811,2539,3557,4987,6983,9781,13693,19181,26861,37607,52667,
  73751,103289,144611,202471,283463,396871,555637,777901,1089091,1524763,
  2134697,2988607,4184087,5857727,8200847,11481199,16073693,22503181,31504453,
  44106241,61748749,86448259,121027561,169438579,237214009,332099617,464939459,
  650915219,911281291,1275793807,1786111267};
  //These are primes near powers of P=1.4
  //Alternating combsort can make do with slightly fewer passes than
  //vanilla combsort, as it does better at moving elements that belong
  //low, low.  Vanilla combsort is nowhere near as good at that.
  //Vanilla combsort moves elements that belong high, high, very efficiently,
  //and "hopes the values that belong low take care of themselves".
  //But they don't.

int combSortIntervals[] = {
  2,3,5,7,11,17,23,29,37,53,71,97,127,167,223,293,383,499,653,853,
  1109,1447,1889,2459,3203,4177,5431,7069,9199,11959,15551,20219,  
  26293,34183,44449,57787,75133,97673,126989,165089,214631,279023,
  362741,471571,613049,796967,1036067,1346899,1750979,2276293,2959183,
  3846943,5001049,6501367,8451791,10987343,14283559,18568637,
  24139231,31381043,40795357,53033969,68944157,89627413,116515667,
  151470353,196911461,255984881,332780323,432614407,562398721,731118341,
  950453827,1235589917,1606266791,2088146713 };
  //These are the primes near powers of P=1.3
  //Combsort runs very slightly faster, on average, if all the 
  //intervals are prime.

template <class T> void combPassSlices(T *base, int elementCount, int step, int firstChain, int lastChain)
{
  //one left-to-right comb pass with gap "step", restricted to the slices
  //numbered firstChain through lastChain: each element is compared with the
  //element "step" positions before it, and the two are swapped if out of order
  T v;
  for (int i=firstChain+step;i<elementCount;i+=step)
  {
    int k=i + lastChain - firstChain;
    if (elementCount<=k) 
      k=elementCount-1;
    for (int j=i;j<=k;++j)
    {
      if (base[j]<base[j-step])
      {
        v = base[j];
        base[j] = base[j-step];
        base[j-step] = v;
      }
    }
  }
}

template <class T> void downwardCombPassSlices(T *base, int elementCount, int step, int firstChain, int lastChain)
{
  //the same comb pass, but scanning from the top of the array downward, which
  //is what moves the values that belong low toward the bottom quickly
  T v;
  int h,i,j;
  for (j=elementCount-firstChain-1;j>=step;j-=step)
  {
    h = j + firstChain - lastChain;
    if (h<step) 
    {
      h=step;
    }
    for (i=j;i>=h;--i)
    {
      if (base[i]<base[i-step])
      {
        v = base[i];
        base[i] = base[i-step];
        base[i-step] = v;
      }
    }
  }
}

template <class T>   void parallelCombSort(T *base, int elementCount, int processorCount, int multiThreadingThreshold, bool alternating)
{    
  int *gaps = combSortIntervals;
  if (alternating)
    gaps = alternatingGaps;

  int f;
  for (f=0; gaps[f]<elementCount; f++);
  f--;    //start from the largest gap that is smaller than elementCount
  CountDown task;
  for (;f>=0;f--)
  {
    int g = gaps[f];
    task += processorCount;
    for (int p=0; p<processorCount; p++)
    {
      int i =  p*g / processorCount;
      int j = (p+1)*g / processorCount - 1;
      if (alternating && (f&1)!=0)
        gpDispatcher->queueOrDo (NewNotifyingAction(CallAction5(downwardCombPassSlices<T>, base, elementCount, g, i, j), &task));
      else
        gpDispatcher->queueOrDo (NewNotifyingAction(CallAction5(combPassSlices<T>, base, elementCount, g, i, j), &task));
    }        
    task.wait();
  }
  parallelInsertionSort(base, elementCount, processorCount, multiThreadingThreshold);  
}
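
Calling these is straightforward. Assuming the dispatcher from the earlier posts has already been set up, a call site looks something like this (the array size, core count and threshold are just placeholder values):

  //assumes gpDispatcher has been created as in the earlier posts
  const int N = 9000000;
  int *data = new int[N];
  //...fill data with the values to be sorted...
  parallelCombSort(data, N, 4, 50000, true);    //4 cores, alternating comb passes
  //or, for the shellsort version:
  //parallelShellSort(data, N, 4, 50000);
  delete [] data;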
