Merge stout:src/pyopencl
inducer committed May 11, 2016
2 parents 52c8c83 + 9ff9ce1 commit 6a18d75
Showing 2 changed files with 22 additions and 36 deletions.
examples/dump-performance.py (15 changes: 5 additions & 10 deletions)
@@ -1,21 +1,18 @@
from __future__ import division
from __future__ import absolute_import
from __future__ import print_function
from __future__ import division, absolute_import, print_function
import pyopencl as cl
import pyopencl.characterize.performance as perf
from six.moves import range




def main():
ctx = cl.create_some_context()

prof_overhead, latency = perf.get_profiling_overhead(ctx)
print("command latency: %g s" % latency)
print("profiling overhead: %g s -> %.1f %%" % (
prof_overhead, 100*prof_overhead/latency))
queue = cl.CommandQueue(ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)
queue = cl.CommandQueue(
ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)

print("empty kernel: %g s" % perf.get_empty_kernel_time(queue))
print("float32 add: %g GOps/s" % (perf.get_add_rate(queue)/1e9))
@@ -29,13 +26,11 @@ def main():
print("----------------------------------------")

print("latency: %g s" % perf.transfer_latency(queue, tx_type))
for i in range(6, 28, 2):
bs = 1<<i
for i in range(6, 31, 2):
bs = 1 << i
print("bandwidth @ %d bytes: %g GB/s" % (
bs, perf.transfer_bandwidth(queue, tx_type, bs)/1e9))




if __name__ == "__main__":
main()
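
Note (not part of the commit): the widened loop above sweeps transfer block sizes from 1 << 6 = 64 bytes up to 1 << 30 = 1 GiB, where the old upper bound of 28 stopped at 1 << 26 = 64 MiB. A minimal standalone sketch of the sizes it now visits:

# Block sizes swept by the updated loop in examples/dump-performance.py.
for i in range(6, 31, 2):
    bs = 1 << i  # 64 B, 256 B, 1 KiB, ..., 64 MiB, 256 MiB, 1 GiB
    print("block size: %d bytes" % bs)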
pyopencl/characterize/performance.py (43 changes: 17 additions & 26 deletions)
@@ -1,7 +1,4 @@
from __future__ import division
from __future__ import absolute_import
from __future__ import print_function
from six.moves import range
from __future__ import division, absolute_import, print_function

__copyright__ = "Copyright (C) 2009 Andreas Kloeckner"

@@ -25,12 +22,11 @@
THE SOFTWARE.
"""

from six.moves import range
import pyopencl as cl
import numpy as np




# {{{ timing helpers

class Timer:
@@ -50,8 +46,6 @@ def get_elapsed(self):
pass




class WallTimer(Timer):
def start(self):
from time import time
@@ -67,8 +61,6 @@ def get_elapsed(self):
return self.end-self.start




def _get_time(queue, f, timer_factory=None, desired_duration=0.1,
warmup_rounds=3):

@@ -106,8 +98,6 @@ def _get_time(queue, f, timer_factory=None, desired_duration=0.1,
# }}}




# {{{ transfer measurements

class HostDeviceTransferBase(object):
@@ -116,32 +106,33 @@ def __init__(self, queue, block_size):
self.host_buf = np.empty(block_size, dtype=np.uint8)
self.dev_buf = cl.Buffer(queue.context, cl.mem_flags.READ_WRITE, block_size)


class HostToDeviceTransfer(HostDeviceTransferBase):
def do(self):
return cl.enqueue_copy(self.queue, self.dev_buf, self.host_buf)


class DeviceToHostTransfer(HostDeviceTransferBase):
def do(self):
return cl.enqueue_copy(self.queue, self.host_buf, self.dev_buf)


class DeviceToDeviceTransfer(object):
def __init__(self, queue, block_size):
self.queue = queue
self.dev_buf_1 = cl.Buffer(queue.context, cl.mem_flags.READ_WRITE, block_size)
self.dev_buf_2 = cl.Buffer(queue.context, cl.mem_flags.READ_WRITE, block_size)
mf = cl.mem_flags
self.dev_buf_1 = cl.Buffer(queue.context, mf.READ_WRITE, block_size)
self.dev_buf_2 = cl.Buffer(queue.context, mf.READ_WRITE, block_size)

def do(self):
return cl.enqueue_copy(self.queue, self.dev_buf_2, self.dev_buf_1)

class HostToDeviceTransfer(HostDeviceTransferBase):
def do(self):
return cl.enqueue_copy(self.queue, self.dev_buf, self.host_buf)


def transfer_latency(queue, transfer_type, timer_factory=None):
transfer = transfer_type(queue, 1)
return _get_time(queue, transfer.do, timer_factory=timer_factory)


def transfer_bandwidth(queue, transfer_type, block_size, timer_factory=None):
"""Measures one-sided bandwidth."""

@@ -151,8 +142,6 @@ def transfer_bandwidth(queue, transfer_type, block_size, timer_factory=None):
# }}}




def get_profiling_overhead(ctx, timer_factory=None):
no_prof_queue = cl.CommandQueue(ctx)
transfer = DeviceToDeviceTransfer(no_prof_queue, 1)
@@ -165,6 +154,7 @@

return prof_time - no_prof_time, prof_time


def get_empty_kernel_time(queue, timer_factory=None):
prg = cl.Program(queue.context, """
__kernel void empty()
@@ -178,13 +168,16 @@ def f():

return _get_time(queue, f, timer_factory=timer_factory)

def _get_full_machine_kernel_rate(queue, src, args, name="benchmark", timer_factory=None):

def _get_full_machine_kernel_rate(queue, src, args, name="benchmark",
timer_factory=None):
prg = cl.Program(queue.context, src).build()

knl = getattr(prg, name)

dev = queue.device
global_size = 4 * dev.max_compute_units

def f():
knl(queue, (global_size,), None, *args)

@@ -198,22 +191,22 @@ def f():

keep_trying = not rates

if rates and rate > 1.05*max(rates): # big improvement
if rates and rate > 1.05*max(rates): # big improvement
keep_trying = True
num_dips = 0

if rates and rate < 0.9*max(rates) and num_dips < 3: # big dip
if rates and rate < 0.9*max(rates) and num_dips < 3: # big dip
keep_trying = True
num_dips += 1

if keep_trying:
global_size *= 2
last_rate = rate
rates.append(rate)
else:
rates.append(rate)
return max(rates)


def get_add_rate(queue, type="float", timer_factory=None):
return 50*10*_get_full_machine_kernel_rate(queue, """
typedef %(op_t)s op_t;
@@ -244,6 +237,4 @@ def get_add_rate(queue, type="float", timer_factory=None):
""" % dict(op_t=type), ())




# vim: foldmethod=marker:filetype=pyopencl
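
Note (not part of the commit): taken together with the example script above, this is a minimal usage sketch of the helpers this file provides, assuming an OpenCL platform is available at runtime; error handling is omitted.

import pyopencl as cl
import pyopencl.characterize.performance as perf

ctx = cl.create_some_context()
queue = cl.CommandQueue(
    ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)

# Per-launch kernel overhead and raw float32 add throughput.
print("empty kernel: %g s" % perf.get_empty_kernel_time(queue))
print("float32 add: %g GOps/s" % (perf.get_add_rate(queue) / 1e9))

# One-sided device-to-device copy bandwidth at a 1 MiB block size.
bw = perf.transfer_bandwidth(queue, perf.DeviceToDeviceTransfer, 1 << 20)
print("d2d bandwidth @ 1 MiB: %g GB/s" % (bw / 1e9))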
