Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
More interface and doc changes
  • Loading branch information
inducer committed Jul 15, 2015
1 parent bcbb113 commit d60e3c1
Show file tree
Hide file tree
Showing 5 changed files with 195 additions and 49 deletions.
7 changes: 6 additions & 1 deletion doc/algorithm.rst
Expand Up @@ -271,4 +271,9 @@ Building many variable-size lists

.. autoclass:: ListOfListsBuilder

.. automethod:: __call__
Bitonic Sort
------------

.. module:: pyopencl.bitonic_sort

.. autoclass:: BitonicSort
3 changes: 3 additions & 0 deletions pyopencl/algorithm.py
Expand Up @@ -395,6 +395,7 @@ def _make_sort_scan_type(device, bits, index_dtype):


# {{{ driver

# import hoisted here to be used as a default argument in the constructor
from pyopencl.scan import GenericScanKernel

Expand All @@ -403,6 +404,8 @@ class RadixSort(object):
"""Provides a general `radix sort <https://en.wikipedia.org/wiki/Radix_sort>`_
on the compute device.
.. seealso:: :class:`pyopencl.algorithm.BitonicSort`
.. versionadded:: 2013.1
"""
def __init__(self, context, arguments, key_expr, sort_arg_names,
Expand Down
81 changes: 62 additions & 19 deletions pyopencl/bitonic_sort.py
Expand Up @@ -6,7 +6,7 @@
All rights reserved.
"""

# based on code at
# based on code at http://www.bealto.com/gpu-sorting_intro.html

__license__ = """
Redistribution and use in source and binary forms, with or without
Expand Down Expand Up @@ -37,13 +37,25 @@

import pyopencl as cl
from pyopencl.tools import dtype_to_ctype
from mako.template import Template
from operator import mul
from functools import reduce
from pytools import memoize_method
from mako.template import Template


def _is_power_of_2(n):
from pyopencl.tools import bitlog2
return n == 0 or 2**bitlog2(n) == n


class BitonicSort(object):
"""Sort an array (or one axis of one) using a sorting network.
Will only work if the axis of the array to be sorted has a length
that is a power of 2.
.. versionadded:: 2015.2
"""
def __init__(self, context, shape, key_dtype, idx_dtype=None, axis=0):
import pyopencl.bitonic_sort_templates as tmpl

Expand All @@ -65,31 +77,60 @@ def __init__(self, context, shape, key_dtype, idx_dtype=None, axis=0):
if idx_dtype is None:
self.argsort = 0
self.idx_t = 'uint' # Dummy

else:
self.argsort = 1
self.idx_t = dtype_to_ctype(idx_dtype)

self.defstpl = Template(tmpl.defines)
self.rq = self.sort_b_prepare_wl(shape, self.axis)
self.run_queue = self.sort_b_prepare_wl(shape, self.axis)

def __call__(self, arr, idx=None, mkcpy=True, queue=None, wait_for=None):
if queue is None:
queue = arr.queue

def __call__(self, _arr, idx=None, mkcpy=True):
arr = _arr.copy() if mkcpy else _arr
rq = self.rq
p, nt, wg, aux = rq[0]
if self.argsort and not type(idx)==type(None):
if wait_for is None:
wait_for = []
wait_for = wait_for + arr.events

last_evt = cl.enqueue_marker(queue, wait_for=wait_for)

if arr.shape[self.axis] == 0:
return arr, last_evt

if not _is_power_of_2(arr.shape[self.axis]):
raise ValueError("sorted array axis length must be a power of 2")

arr = arr.copy() if mkcpy else arr

run_queue = self.run_queue
knl, nt, wg, aux = run_queue[0]

if self.argsort and idx is not None:
if aux:
p.run(arr.queue, (nt,), wg, arr.data, idx.data, cl.LocalMemory(wg[0]*4*arr.dtype.itemsize),\
cl.LocalMemory(wg[0]*4*idx.dtype.itemsize))
for p, nt, wg,_ in rq[1:]:
p.run(arr.queue, (nt,), wg, arr.data, idx.data)
elif self.argsort==0:
last_evt = knl(
queue, (nt,), wg, arr.data, idx.data,
cl.LocalMemory(wg[0]*4*arr.dtype.itemsize),
cl.LocalMemory(wg[0]*4*idx.dtype.itemsize),
wait_for=[last_evt])
for knl, nt, wg, _ in run_queue[1:]:
last_evt = knl(
queue, (nt,), wg, arr.data, idx.data,
wait_for=[last_evt])

elif not self.argsort:
if aux:
p.run(arr.queue, (nt,), wg, arr.data, cl.LocalMemory(wg[0]*4*arr.dtype.itemsize))
for p, nt, wg,_ in rq[1:]:
p.run(arr.queue, (nt,), wg, arr.data)
last_evt = knl(
queue, (nt,), wg, arr.data,
cl.LocalMemory(wg[0]*4*arr.dtype.itemsize),
wait_for=[last_evt])
for knl, nt, wg, _ in run_queue[1:]:
last_evt = knl(queue, (nt,), wg, arr.data, wait_for=[last_evt])

else:
raise ValueError("Array of indexes required for this sorter. If argsort is not needed,\
recreate sorter witout index datatype provided.")
return arr
return arr, last_evt

@memoize_method
def get_program(self, letter, params):
Expand All @@ -102,7 +143,9 @@ def get_program(self, letter, params):
dsize=params[4], nsize=params[5])

self.cached_defs[params] = defs

kid = Template(self.kernels_srcs[letter]).render(argsort=self.argsort)

prg = cl.Program(self.context, defs + kid).build()
return prg

Expand All @@ -122,7 +165,7 @@ def sort_b_prepare_wl(self, shape, axis):
wg = min(ds, self.context.devices[0].max_work_group_size)
length = wg >> 1
prg = self.get_program('BLO', (1, 1, self.dtype, self.idx_t, ds, ns))
run_queue.append((prg, size, (wg,), True))
run_queue.append((prg.run, size, (wg,), True))

while length < ds:
inc = length
Expand All @@ -146,7 +189,7 @@ def sort_b_prepare_wl(self, shape, axis):

prg = self.get_program(letter,
(inc, direction, self.dtype, self.idx_t, ds, ns))
run_queue.append((prg, nthreads, None, False,))
run_queue.append((prg.run, nthreads, None, False,))
inc >>= ninc

length <<= 1
Expand Down
116 changes: 92 additions & 24 deletions pyopencl/bitonic_sort_templates.py
Expand Up @@ -3,18 +3,38 @@
Copyright (c) 2015, Ilya Efimoff
All rights reserved.
"""
__license__ = """
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
__license__ = """
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
3. Neither the name of the copyright holder nor the names of its contributors
may be used to endorse or promote products derived from this software without
specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""


defines = """
# {{{ defines

defines = """//CL//
typedef ${dtype} data_t;
typedef ${idxtype} idx_t;
typedef ${idxtype}2 idx_t2;
Expand Down Expand Up @@ -67,9 +87,14 @@
#define dsize ${dsize} //Dimension size
"""

ParallelBitonic_B2 = """
# }}}


# {{{ B2

ParallelBitonic_B2 = """//CL//
// N/2 threads
//ParallelBitonic_B2
//ParallelBitonic_B2
__kernel void run(__global data_t * data\\
% if argsort:
, __global idx_t * index)
Expand Down Expand Up @@ -117,9 +142,14 @@
}
"""

ParallelBitonic_B4 = """
# }}}


# {{{ B4

ParallelBitonic_B4 = """//CL//
// N/4 threads
//ParallelBitonic_B4
//ParallelBitonic_B4
__kernel void run(__global data_t * data\\
% if argsort:
, __global idx_t * index)
Expand Down Expand Up @@ -179,9 +209,14 @@
}
"""

ParallelBitonic_B8 = """
# }}}


# {{{ B8

ParallelBitonic_B8 = """//CL//
// N/8 threads
//ParallelBitonic_B8
//ParallelBitonic_B8
__kernel void run(__global data_t * data\\
% if argsort:
, __global idx_t * index)
Expand Down Expand Up @@ -226,9 +261,14 @@
}
"""

ParallelBitonic_B16 = """
# }}}


# {{{ B16

ParallelBitonic_B16 = """//CL//
// N/16 threads
//ParallelBitonic_B16
//ParallelBitonic_B16
__kernel void run(__global data_t * data\\
% if argsort:
, __global idx_t * index)
Expand Down Expand Up @@ -273,8 +313,13 @@
}
"""

ParallelBitonic_C4 = """
//ParallelBitonic_C4
# }}}


# {{{ C4

ParallelBitonic_C4 = """//CL//
//ParallelBitonic_C4
__kernel void run\\
% if argsort:
(__global data_t * data, __global idx_t * index, __local data_t * aux, __local idx_t * auy)
Expand Down Expand Up @@ -342,8 +387,12 @@
}
"""

# }}}


# {{{ local merge

ParallelMerge_Local = """
ParallelMerge_Local = """//CL//
// N threads, WG is workgroup size. Sort WG input blocks in each workgroup.
__kernel void run(__global const data_t * in,__global data_t * out,__local data_t * aux)
{
Expand Down Expand Up @@ -386,7 +435,12 @@
}
"""

ParallelBitonic_Local = """
# }}}


# {{{

ParallelBitonic_Local = """//CL//
// N threads, WG is workgroup size. Sort WG input blocks in each workgroup.
__kernel void run(__global const data_t * in,__global data_t * out,__local data_t * aux)
{
Expand Down Expand Up @@ -426,7 +480,12 @@
}
"""

ParallelBitonic_A = """
# }}}


# {{{ A

ParallelBitonic_A = """//CL//
__kernel void ParallelBitonic_A(__global const data_t * in)
{
int i = get_global_id(0); // thread index
Expand All @@ -447,7 +506,12 @@
}
"""

ParallelBitonic_Local_Optim = """
# }}}


# {{{ local optim

ParallelBitonic_Local_Optim = """//CL//
__kernel void run\\
% if argsort:
(__global data_t * data, __global idx_t * index, __local data_t * aux, __local idx_t * auy)
Expand All @@ -464,7 +528,7 @@
// Move IN, OUT to block start
//int offset = get_group_id(0) * wg;
data += offset;
data += offset;
// Load block in AUX[WG]
data_t iData = data[t*nsize];
aux[i] = iData;
Expand Down Expand Up @@ -513,4 +577,8 @@
index[t*nsize] = iidx;
% endif
}
"""
"""

# }}}

# vim: filetype=pyopencl:fdm=marker

0 comments on commit d60e3c1

Please sign in to comment.