From 9874d9cc376ccfe4c4cc41ecd05b87fab6809715 Mon Sep 17 00:00:00 2001 From: Nick Date: Thu, 20 Feb 2025 17:20:05 -0600 Subject: [PATCH 01/18] Add a vector constant of the appropriate size. --- loopy/expression.py | 9 ++++----- test/test_target.py | 30 ++++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+), 5 deletions(-) diff --git a/loopy/expression.py b/loopy/expression.py index 5a11b8354..881e41ac6 100644 --- a/loopy/expression.py +++ b/loopy/expression.py @@ -22,7 +22,9 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ -from typing import TYPE_CHECKING +from typing import ( + TYPE_CHECKING, +) import numpy as np @@ -162,10 +164,7 @@ def map_constant(self, expr: object) -> bool: def map_variable(self, expr: p.Variable) -> bool: if expr.name == self.vec_iname: - # Technically, this is doable. But we're not going there. - raise UnvectorizableError() - - # A single variable is always a scalar. + return True return False map_tagged_variable = map_variable diff --git a/test/test_target.py b/test/test_target.py index fe2ad1d8a..5e9014f0a 100644 --- a/test/test_target.py +++ b/test/test_target.py @@ -875,6 +875,36 @@ def test_float3(): assert "float3" in device_code +def test_cl_vectorize_index_variable(ctx_factory): + knl = lp.make_kernel( + "{ [i]: 0<=i0") + + rng = np.random.default_rng(seed=12) + a = rng.normal(size=(16, 4)) + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + knl = lp.add_and_infer_dtypes(knl, {"a": np.float64, "n": np.int64}) + _evt, (result,) = knl(queue, a=a, n=a.size) + + result_ref = np.zeros(a.shape, dtype=np.float64) + for i in range(16): + for j in range(4): + ind = i*4 + j + result_ref[i, j] = a[i, j] * 3 if ind < 32 else np.sin(a[i, j]) + + assert np.allclose(result, result_ref) + + if __name__ == "__main__": import sys if len(sys.argv) > 1: From 2bf9c8bf366b134dc12315431b34e14736c5bdc7 Mon Sep 17 00:00:00 2001 From: Nick Date: Mon, 24 Feb 2025 13:52:31 -0600 Subject: [PATCH 02/18] Move the OpenCL specifics to an OpenCL mapper. Modify typecast for vector types. --- loopy/target/c/codegen/expression.py | 41 +++++++++++++++++++++++++++- loopy/target/opencl.py | 24 ++++++++++++++++ 2 files changed, 64 insertions(+), 1 deletion(-) diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py index 83c13dfe5..03ce925bc 100644 --- a/loopy/target/c/codegen/expression.py +++ b/loopy/target/c/codegen/expression.py @@ -130,6 +130,12 @@ def wrap_in_typecast(self, actual_type: LoopyType, needed_type: LoopyType, s): if actual_type != needed_type: registry = self.codegen_state.ast_builder.target.get_dtype_registry() cast = var("(%s) " % registry.dtype_to_ctype(needed_type)) + if self.codegen_state.target.is_vector_dtype(needed_type): + # OpenCL does not let you do explicit vector type casts. + # Instead you need to call their function which is of the form + # convert_(src) where desttype is the type you want and n + # is the number of elements in the vector which is the same as in src. + cast = var("convert_%s" % registry.dtype_to_ctype(needed_type)) return cast(s) return s @@ -414,9 +420,37 @@ def map_remainder(self, expr, type_context): def map_if(self, expr, type_context): from loopy.types import to_loopy_type result_type = self.infer_type(expr) + conditional_needed_loopy_type = to_loopy_type(np.bool_) + if self.codegen_state.vectorization_info: + from loopy.expression import VectorizabilityChecker + from loopy.codegen import UnvectorizableError + checker = VectorizabilityChecker(self.codegen_state.kernel, + self.codegen_state.vectorization_info.iname, + self.codegen_state.vectorization_info.length) + + try: + is_vector = checker(expr) + + if is_vector: + """ + We could have a vector literal here. + So we may need to type cast the condition. + OpenCL specification states that for ( c ? a : b) + to be vectorized appropriately c must have the same + number of elements in the vector as that of a and b. + Also each element must have the same number of bits, + and c must be an integral type. + """ + index_type = to_loopy_type(np.int64) + if type_context == "f": + index_type = to_loopy_type(np.int32) + conditional_needed_loopy_type = to_loopy_type(self.codegen_state.target.vector_dtype(index_type, + self.codegen_state.vectorization_info.length)) + except UnvectorizableError: + pass return type(expr)( self.rec(expr.condition, type_context, - to_loopy_type(np.bool_)), + conditional_needed_loopy_type), self.rec(expr.then, type_context, result_type), self.rec(expr.else_, type_context, result_type), ) @@ -712,8 +746,13 @@ def map_min(self, expr, enclosing_prec): map_max = map_min + def map_type_cast(self, expr, enclosing_prec): + breakpoint() + return super().map_type_cast(expr, enclosing_prec) + def map_if(self, expr, enclosing_prec): from pymbolic.mapper.stringifier import PREC_CALL, PREC_NONE + breakpoint() return "({} ? {} : {})".format( # Force parentheses around the condition to prevent compiler # warnings regarding precedence (e.g. with POCL 1.8/LLVM 12): diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index 07c5b49d0..c6266b341 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -545,6 +545,15 @@ def wrap_in_typecast(self, actual_type, needed_dtype, s): from pymbolic.primitives import Comparison return Comparison(s, "!=", 0) + registry = self.codegen_state.ast_builder.target.get_dtype_registry() + if self.codegen_state.target.is_vector_dtype(needed_dtype): + # OpenCL does not let you do explicit vector type casts. + # Instead you need to call their function which is of the form + # convert_(src) where desttype is the type you want and n + # is the number of elements in the vector which is the same as in src. + cast = var("convert_%s" % registry.dtype_to_ctype(needed_dtype)) + return cast(s) + return super().wrap_in_typecast(actual_type, needed_dtype, s) def map_group_hw_index(self, expr, type_context): @@ -553,6 +562,21 @@ def map_group_hw_index(self, expr, type_context): def map_local_hw_index(self, expr, type_context): return var("lid")(expr.axis) + def map_variable(self, expr, type_context): + + if self.codegen_state.vectorization_info: + if self.codegen_state.vectorization_info.iname == expr.name: + # This needs to be converted into a vector literal. + from loopy.symbolic import Literal + vector_length = self.codegen_state.vectorization_info.length + index_type = self.codegen_state.kernel.index_dtype + vector_type = self.codegen_state.target.vector_dtype(index_type, + vector_length) + typecast = self.codegen_state.target.dtype_to_typename(vector_type) + vector_literal = f"(({typecast})" + " (" + \ + ",".join([f"{i}" for i in range(vector_length)]) + "))" + return Literal(vector_literal) + return super().map_variable(expr, type_context) # }}} From 133055a4aff226c95580afa85a6b3d24c1438fdd Mon Sep 17 00:00:00 2001 From: Nick Date: Mon, 24 Feb 2025 13:58:04 -0600 Subject: [PATCH 03/18] Remove unused code. --- loopy/target/c/codegen/expression.py | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py index 03ce925bc..c4cf480ba 100644 --- a/loopy/target/c/codegen/expression.py +++ b/loopy/target/c/codegen/expression.py @@ -130,14 +130,8 @@ def wrap_in_typecast(self, actual_type: LoopyType, needed_type: LoopyType, s): if actual_type != needed_type: registry = self.codegen_state.ast_builder.target.get_dtype_registry() cast = var("(%s) " % registry.dtype_to_ctype(needed_type)) - if self.codegen_state.target.is_vector_dtype(needed_type): - # OpenCL does not let you do explicit vector type casts. - # Instead you need to call their function which is of the form - # convert_(src) where desttype is the type you want and n - # is the number of elements in the vector which is the same as in src. - cast = var("convert_%s" % registry.dtype_to_ctype(needed_type)) - return cast(s) + return cast(s) return s def rec(self, expr, type_context=None, needed_type: LoopyType | None = None): # type: ignore[override] @@ -422,8 +416,8 @@ def map_if(self, expr, type_context): result_type = self.infer_type(expr) conditional_needed_loopy_type = to_loopy_type(np.bool_) if self.codegen_state.vectorization_info: - from loopy.expression import VectorizabilityChecker from loopy.codegen import UnvectorizableError + from loopy.expression import VectorizabilityChecker checker = VectorizabilityChecker(self.codegen_state.kernel, self.codegen_state.vectorization_info.iname, self.codegen_state.vectorization_info.length) @@ -746,13 +740,8 @@ def map_min(self, expr, enclosing_prec): map_max = map_min - def map_type_cast(self, expr, enclosing_prec): - breakpoint() - return super().map_type_cast(expr, enclosing_prec) - def map_if(self, expr, enclosing_prec): from pymbolic.mapper.stringifier import PREC_CALL, PREC_NONE - breakpoint() return "({} ? {} : {})".format( # Force parentheses around the condition to prevent compiler # warnings regarding precedence (e.g. with POCL 1.8/LLVM 12): From a1f960638c311584c0504fa08542bdbba884c2e6 Mon Sep 17 00:00:00 2001 From: Nick Date: Mon, 24 Feb 2025 14:21:34 -0600 Subject: [PATCH 04/18] Modify the typecast for vector dtypes. --- loopy/target/c/codegen/expression.py | 32 ++-------------------- loopy/target/opencl.py | 41 ++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+), 30 deletions(-) diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py index c4cf480ba..83c13dfe5 100644 --- a/loopy/target/c/codegen/expression.py +++ b/loopy/target/c/codegen/expression.py @@ -130,8 +130,8 @@ def wrap_in_typecast(self, actual_type: LoopyType, needed_type: LoopyType, s): if actual_type != needed_type: registry = self.codegen_state.ast_builder.target.get_dtype_registry() cast = var("(%s) " % registry.dtype_to_ctype(needed_type)) - return cast(s) + return s def rec(self, expr, type_context=None, needed_type: LoopyType | None = None): # type: ignore[override] @@ -414,37 +414,9 @@ def map_remainder(self, expr, type_context): def map_if(self, expr, type_context): from loopy.types import to_loopy_type result_type = self.infer_type(expr) - conditional_needed_loopy_type = to_loopy_type(np.bool_) - if self.codegen_state.vectorization_info: - from loopy.codegen import UnvectorizableError - from loopy.expression import VectorizabilityChecker - checker = VectorizabilityChecker(self.codegen_state.kernel, - self.codegen_state.vectorization_info.iname, - self.codegen_state.vectorization_info.length) - - try: - is_vector = checker(expr) - - if is_vector: - """ - We could have a vector literal here. - So we may need to type cast the condition. - OpenCL specification states that for ( c ? a : b) - to be vectorized appropriately c must have the same - number of elements in the vector as that of a and b. - Also each element must have the same number of bits, - and c must be an integral type. - """ - index_type = to_loopy_type(np.int64) - if type_context == "f": - index_type = to_loopy_type(np.int32) - conditional_needed_loopy_type = to_loopy_type(self.codegen_state.target.vector_dtype(index_type, - self.codegen_state.vectorization_info.length)) - except UnvectorizableError: - pass return type(expr)( self.rec(expr.condition, type_context, - conditional_needed_loopy_type), + to_loopy_type(np.bool_)), self.rec(expr.then, type_context, result_type), self.rec(expr.else_, type_context, result_type), ) diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index c6266b341..c4453227d 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -577,6 +577,47 @@ def map_variable(self, expr, type_context): ",".join([f"{i}" for i in range(vector_length)]) + "))" return Literal(vector_literal) return super().map_variable(expr, type_context) + + def map_if(self, expr, type_context): + from loopy.types import to_loopy_type + result_type = self.infer_type(expr) + conditional_needed_loopy_type = to_loopy_type(np.bool_) + if self.codegen_state.vectorization_info: + from loopy.codegen import UnvectorizableError + from loopy.expression import VectorizabilityChecker + checker = VectorizabilityChecker(self.codegen_state.kernel, + self.codegen_state.vectorization_info.iname, + self.codegen_state.vectorization_info.length) + + try: + is_vector = checker(expr) + + if is_vector: + """ + We could have a vector literal here. + So we may need to type cast the condition. + OpenCL specification states that for ( c ? a : b) + to be vectorized appropriately c must have the same + number of elements in the vector as that of a and b. + Also each element must have the same number of bits, + and c must be an integral type. + """ + index_type = to_loopy_type(np.int64) + if type_context == "f": + index_type = to_loopy_type(np.int32) + length = self.codegen_state.vectorization_info.length + vector_type = self.codegen_state.target.vector_dtype(index_type, + length) + conditional_needed_loopy_type = to_loopy_type(vector_type) + except UnvectorizableError: + pass + + return type(expr)( + self.rec(expr.condition, type_context, + conditional_needed_loopy_type), + self.rec(expr.then, type_context, result_type), + self.rec(expr.else_, type_context, result_type), + ) # }}} From 65766ba0fb1934fd848397a2ba004a9ebfb57c63 Mon Sep 17 00:00:00 2001 From: Nick Date: Mon, 24 Feb 2025 15:05:38 -0600 Subject: [PATCH 05/18] Just return the value if we do not need to typecast. --- loopy/target/opencl.py | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index c4453227d..b57747e54 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -46,6 +46,7 @@ from loopy.codegen import CodeGenerationState from loopy.codegen.result import CodeGenerationResult + from loopy.kernel import LoopKernel # {{{ dtype registry wrappers @@ -456,7 +457,7 @@ def get_opencl_callables(): # {{{ symbol mangler -def opencl_symbol_mangler(kernel, name): +def opencl_symbol_mangler(kernel: LoopKernel, name: str): # FIXME: should be more picky about exact names if name.startswith("FLT_"): return NumpyType(np.dtype(np.float32)), name @@ -544,25 +545,29 @@ def wrap_in_typecast(self, actual_type, needed_dtype, s): # CL does not perform implicit conversion from float-type to a bool. from pymbolic.primitives import Comparison return Comparison(s, "!=", 0) + + if needed_dtype == actual_type: + return s registry = self.codegen_state.ast_builder.target.get_dtype_registry() - if self.codegen_state.target.is_vector_dtype(needed_dtype): - # OpenCL does not let you do explicit vector type casts. - # Instead you need to call their function which is of the form - # convert_(src) where desttype is the type you want and n + if self.codegen_state.target.is_vector_dtype(needed_dtype) and \ + self.codegen_state.target.is_vector_dtype(actual_type): + # OpenCL does not let you do explicit vector type casts between vector + # types. Instead you need to call their function which is of the form + # convert_(src) where n # is the number of elements in the vector which is the same as in src. cast = var("convert_%s" % registry.dtype_to_ctype(needed_dtype)) return cast(s) return super().wrap_in_typecast(actual_type, needed_dtype, s) - def map_group_hw_index(self, expr, type_context): + def map_group_hw_index(self, expr, type_context: str): return var("gid")(expr.axis) - def map_local_hw_index(self, expr, type_context): + def map_local_hw_index(self, expr, type_context: str): return var("lid")(expr.axis) - def map_variable(self, expr, type_context): + def map_variable(self, expr, type_context: str): if self.codegen_state.vectorization_info: if self.codegen_state.vectorization_info.iname == expr.name: @@ -578,7 +583,7 @@ def map_variable(self, expr, type_context): return Literal(vector_literal) return super().map_variable(expr, type_context) - def map_if(self, expr, type_context): + def map_if(self, expr, type_context: str): from loopy.types import to_loopy_type result_type = self.infer_type(expr) conditional_needed_loopy_type = to_loopy_type(np.bool_) From 3061cc69da25e989b5dfe391662179124b000852 Mon Sep 17 00:00:00 2001 From: Nick Date: Mon, 24 Feb 2025 15:06:06 -0600 Subject: [PATCH 06/18] Ruff. --- loopy/target/opencl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index b57747e54..67c5a5634 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -545,7 +545,7 @@ def wrap_in_typecast(self, actual_type, needed_dtype, s): # CL does not perform implicit conversion from float-type to a bool. from pymbolic.primitives import Comparison return Comparison(s, "!=", 0) - + if needed_dtype == actual_type: return s From af7d947ba5f8c221a8f400a7af3ba4876d499297 Mon Sep 17 00:00:00 2001 From: Nick Date: Mon, 24 Feb 2025 15:21:37 -0600 Subject: [PATCH 07/18] Use the convert function only if we are converting from a boolean or a previously vectorized dtype. --- loopy/target/opencl.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index 67c5a5634..b27a15384 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -550,14 +550,15 @@ def wrap_in_typecast(self, actual_type, needed_dtype, s): return s registry = self.codegen_state.ast_builder.target.get_dtype_registry() - if self.codegen_state.target.is_vector_dtype(needed_dtype) and \ - self.codegen_state.target.is_vector_dtype(actual_type): + if self.codegen_state.target.is_vector_dtype(needed_dtype): # OpenCL does not let you do explicit vector type casts between vector # types. Instead you need to call their function which is of the form # convert_(src) where n # is the number of elements in the vector which is the same as in src. - cast = var("convert_%s" % registry.dtype_to_ctype(needed_dtype)) - return cast(s) + if self.codegen_state.target.is_vector_dtype(actual_type) or \ + actual_type.dtype.kind == "b": + cast = var("convert_%s" % registry.dtype_to_ctype(needed_dtype)) + return cast(s) return super().wrap_in_typecast(actual_type, needed_dtype, s) From 141d9d72fc2161b16ec028988e9669265d643c77 Mon Sep 17 00:00:00 2001 From: Nick Date: Mon, 24 Feb 2025 15:38:42 -0600 Subject: [PATCH 08/18] Mypy changes. --- loopy/target/opencl.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index b27a15384..121135662 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -562,13 +562,13 @@ def wrap_in_typecast(self, actual_type, needed_dtype, s): return super().wrap_in_typecast(actual_type, needed_dtype, s) - def map_group_hw_index(self, expr, type_context: str): + def map_group_hw_index(self, expr, type_context): return var("gid")(expr.axis) - def map_local_hw_index(self, expr, type_context: str): + def map_local_hw_index(self, expr, type_context): return var("lid")(expr.axis) - def map_variable(self, expr, type_context: str): + def map_variable(self, expr, type_context): if self.codegen_state.vectorization_info: if self.codegen_state.vectorization_info.iname == expr.name: @@ -584,7 +584,7 @@ def map_variable(self, expr, type_context: str): return Literal(vector_literal) return super().map_variable(expr, type_context) - def map_if(self, expr, type_context: str): + def map_if(self, expr, type_context): from loopy.types import to_loopy_type result_type = self.infer_type(expr) conditional_needed_loopy_type = to_loopy_type(np.bool_) From 7d95c97602533938e4ec6274f38a9cf1747bab55 Mon Sep 17 00:00:00 2001 From: Nick Date: Mon, 24 Feb 2025 20:40:58 -0600 Subject: [PATCH 09/18] Respond to comments. --- loopy/expression.py | 1 + loopy/target/opencl.py | 31 ++++++++++++++++++++----------- 2 files changed, 21 insertions(+), 11 deletions(-) diff --git a/loopy/expression.py b/loopy/expression.py index 881e41ac6..3a87dbe61 100644 --- a/loopy/expression.py +++ b/loopy/expression.py @@ -164,6 +164,7 @@ def map_constant(self, expr: object) -> bool: def map_variable(self, expr: p.Variable) -> bool: if expr.name == self.vec_iname: + # Technically, this is doable. return True return False diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index 121135662..d58aecc13 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -457,7 +457,8 @@ def get_opencl_callables(): # {{{ symbol mangler -def opencl_symbol_mangler(kernel: LoopKernel, name: str): +def opencl_symbol_mangler(kernel: LoopKernel, + name: str) -> tuple[NumpyType, str] | None: # FIXME: should be more picky about exact names if name.startswith("FLT_"): return NumpyType(np.dtype(np.float32)), name @@ -555,6 +556,7 @@ def wrap_in_typecast(self, actual_type, needed_dtype, s): # types. Instead you need to call their function which is of the form # convert_(src) where n # is the number of elements in the vector which is the same as in src. + # https://registry.khronos.org/OpenCL/specs/3.0-unified/html/OpenCL_C.html#explicit-casts if self.codegen_state.target.is_vector_dtype(actual_type) or \ actual_type.dtype.kind == "b": cast = var("convert_%s" % registry.dtype_to_ctype(needed_dtype)) @@ -600,18 +602,25 @@ def map_if(self, expr, type_context): if is_vector: """ - We could have a vector literal here. - So we may need to type cast the condition. - OpenCL specification states that for ( c ? a : b) - to be vectorized appropriately c must have the same - number of elements in the vector as that of a and b. - Also each element must have the same number of bits, - and c must be an integral type. + We could have a vector literal here which may need to be + converted to an appropriate size. The OpenCL specification states + that for ( c ? a : b) a, b, and c must have the same + number of elements and bits and that c must be an integral type. + https://registry.khronos.org/OpenCL/specs/3.0-unified/html/OpenCL_C.html#table-builtin-relational """ - index_type = to_loopy_type(np.int64) - if type_context == "f": - index_type = to_loopy_type(np.int32) + index_type = to_loopy_type(self.codegen_state.kernel.index_dtype) + types = {8: to_loopy_type(np.int64), 4: to_loopy_type(np.int32), + 2: to_loopy_type(np.int16), 1: to_loopy_type(np.int8)} length = self.codegen_state.vectorization_info.length + if index_type.itemsize != result_type.itemsize and \ + result_type.itemsize in types.keys(): + # Need to convert index type into result type size. + # Item size is measured in bytes. + index_type = types[result_type.itemsize] + elif index_type.itemsize * length != result_type.itemsize and \ + (result_type.itemsize // length) in types.keys(): + + index_type = types[result_type.itemsize // length] vector_type = self.codegen_state.target.vector_dtype(index_type, length) conditional_needed_loopy_type = to_loopy_type(vector_type) From 144b3df224f7bbd523a9f8ef162b696016abe8d7 Mon Sep 17 00:00:00 2001 From: Nick Date: Thu, 27 Feb 2025 10:27:22 -0600 Subject: [PATCH 10/18] Respond to Andreas's comments. --- loopy/expression.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/loopy/expression.py b/loopy/expression.py index 3a87dbe61..3c2b6755d 100644 --- a/loopy/expression.py +++ b/loopy/expression.py @@ -22,9 +22,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ -from typing import ( - TYPE_CHECKING, -) +from typing import TYPE_CHECKING import numpy as np @@ -166,6 +164,7 @@ def map_variable(self, expr: p.Variable) -> bool: if expr.name == self.vec_iname: # Technically, this is doable. return True + # A single variable is always a scalar. return False map_tagged_variable = map_variable From e261f65ae8b73461bf9a2bd7a74cc2f8b3e626bb Mon Sep 17 00:00:00 2001 From: Nick Date: Tue, 11 Mar 2025 10:53:37 -0500 Subject: [PATCH 11/18] Responding to comments. --- loopy/expression.py | 1 - loopy/target/opencl.py | 23 ++++++++++++----------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/loopy/expression.py b/loopy/expression.py index 3c2b6755d..a00e4c7f0 100644 --- a/loopy/expression.py +++ b/loopy/expression.py @@ -162,7 +162,6 @@ def map_constant(self, expr: object) -> bool: def map_variable(self, expr: p.Variable) -> bool: if expr.name == self.vec_iname: - # Technically, this is doable. return True # A single variable is always a scalar. return False diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index d58aecc13..1b244412d 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -24,6 +24,7 @@ THE SOFTWARE. """ +from contextlib import suppress from typing import TYPE_CHECKING, Literal, Sequence import numpy as np @@ -557,8 +558,7 @@ def wrap_in_typecast(self, actual_type, needed_dtype, s): # convert_(src) where n # is the number of elements in the vector which is the same as in src. # https://registry.khronos.org/OpenCL/specs/3.0-unified/html/OpenCL_C.html#explicit-casts - if self.codegen_state.target.is_vector_dtype(actual_type) or \ - actual_type.dtype.kind == "b": + if self.codegen_state.target.is_vector_dtype(actual_type): cast = var("convert_%s" % registry.dtype_to_ctype(needed_dtype)) return cast(s) @@ -580,8 +580,8 @@ def map_variable(self, expr, type_context): index_type = self.codegen_state.kernel.index_dtype vector_type = self.codegen_state.target.vector_dtype(index_type, vector_length) - typecast = self.codegen_state.target.dtype_to_typename(vector_type) - vector_literal = f"(({typecast})" + " (" + \ + typename = self.codegen_state.target.dtype_to_typename(vector_type) + vector_literal = f"(({typename})" + " (" + \ ",".join([f"{i}" for i in range(vector_length)]) + "))" return Literal(vector_literal) return super().map_variable(expr, type_context) @@ -597,7 +597,10 @@ def map_if(self, expr, type_context): self.codegen_state.vectorization_info.iname, self.codegen_state.vectorization_info.length) - try: + with suppress(UnvectorizableError): + # We know there is an expression in codegen which can be vectorized. + # We are checking if this is one of the them. If it is not, then we can + # just continue with scalar code generation for this expression. is_vector = checker(expr) if is_vector: @@ -612,20 +615,18 @@ def map_if(self, expr, type_context): types = {8: to_loopy_type(np.int64), 4: to_loopy_type(np.int32), 2: to_loopy_type(np.int16), 1: to_loopy_type(np.int8)} length = self.codegen_state.vectorization_info.length - if index_type.itemsize != result_type.itemsize and \ - result_type.itemsize in types.keys(): + if (index_type.itemsize != result_type.itemsize and + result_type.itemsize in types): # Need to convert index type into result type size. # Item size is measured in bytes. index_type = types[result_type.itemsize] - elif index_type.itemsize * length != result_type.itemsize and \ - (result_type.itemsize // length) in types.keys(): + elif (index_type.itemsize * length != result_type.itemsize and + (result_type.itemsize // length) in types): index_type = types[result_type.itemsize // length] vector_type = self.codegen_state.target.vector_dtype(index_type, length) conditional_needed_loopy_type = to_loopy_type(vector_type) - except UnvectorizableError: - pass return type(expr)( self.rec(expr.condition, type_context, From 89474ea18af77b910ab947f3482850e6a51e6482 Mon Sep 17 00:00:00 2001 From: Nick Date: Tue, 11 Mar 2025 14:08:21 -0500 Subject: [PATCH 12/18] Use the numpy vectorized version. --- test/test_target.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/test/test_target.py b/test/test_target.py index 5e9014f0a..9231ffe78 100644 --- a/test/test_target.py +++ b/test/test_target.py @@ -896,11 +896,10 @@ def test_cl_vectorize_index_variable(ctx_factory): knl = lp.add_and_infer_dtypes(knl, {"a": np.float64, "n": np.int64}) _evt, (result,) = knl(queue, a=a, n=a.size) - result_ref = np.zeros(a.shape, dtype=np.float64) - for i in range(16): - for j in range(4): - ind = i*4 + j - result_ref[i, j] = a[i, j] * 3 if ind < 32 else np.sin(a[i, j]) + i = np.arange(16) + j = np.arange(4) + ind = 4*i[:,None] + j + result_ref = np.where(ind < 32, a*3, np.sin(a)) assert np.allclose(result, result_ref) From 1b9334de6e28c1ba6a1ba9efb583ec005fb6c610 Mon Sep 17 00:00:00 2001 From: Nick Date: Wed, 12 Mar 2025 17:13:27 -0500 Subject: [PATCH 13/18] Add typed literal. --- loopy/symbolic.py | 15 +++++++++++++++ loopy/target/opencl.py | 35 +++++++++++++++++++++++------------ 2 files changed, 38 insertions(+), 12 deletions(-) diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 20ff55fea..2bdd8db70 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -606,6 +606,21 @@ class Literal(LoopyExpressionBase): s: str +@p.expr_dataclass() +class TypedLiteral(Literal): + """A literal to be used during code generation which we know the type of. + + .. note:: + + Only used in the output of + :mod:`loopy.target.c.codegen.expression.ExpressionToCExpressionMapper` (and + similar mappers). Not for use in Loopy source representation. + """ + + s: str + dtype: ToLoopyTypeConvertible + + @p.expr_dataclass() class ArrayLiteral(LoopyExpressionBase): """An array literal. diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index 1b244412d..524ed7e6e 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -543,6 +543,7 @@ def opencl_preamble_generator(preamble_info): class ExpressionToOpenCLCExpressionMapper(ExpressionToCExpressionMapper): def wrap_in_typecast(self, actual_type, needed_dtype, s): + if needed_dtype.dtype.kind == "b" and actual_type.dtype.kind == "f": # CL does not perform implicit conversion from float-type to a bool. from pymbolic.primitives import Comparison @@ -558,7 +559,13 @@ def wrap_in_typecast(self, actual_type, needed_dtype, s): # convert_(src) where n # is the number of elements in the vector which is the same as in src. # https://registry.khronos.org/OpenCL/specs/3.0-unified/html/OpenCL_C.html#explicit-casts - if self.codegen_state.target.is_vector_dtype(actual_type): + + # We infer the data type of (s) before we recurse down into (s) to convert + # to a CExpression. With vectorization, we can change the actual type of (s) + # from a scalar type to a vector type. So we are going to recompute the + # actual type. + type_of_s = self.infer_type(s) + if self.codegen_state.target.is_vector_dtype(type_of_s): cast = var("convert_%s" % registry.dtype_to_ctype(needed_dtype)) return cast(s) @@ -575,7 +582,7 @@ def map_variable(self, expr, type_context): if self.codegen_state.vectorization_info: if self.codegen_state.vectorization_info.iname == expr.name: # This needs to be converted into a vector literal. - from loopy.symbolic import Literal + from loopy.symbolic import TypedLiteral vector_length = self.codegen_state.vectorization_info.length index_type = self.codegen_state.kernel.index_dtype vector_type = self.codegen_state.target.vector_dtype(index_type, @@ -583,7 +590,9 @@ def map_variable(self, expr, type_context): typename = self.codegen_state.target.dtype_to_typename(vector_type) vector_literal = f"(({typename})" + " (" + \ ",".join([f"{i}" for i in range(vector_length)]) + "))" - return Literal(vector_literal) + return TypedLiteral(vector_literal, vector_type) + + # return Literal(vector_literal) return super().map_variable(expr, type_context) def map_if(self, expr, type_context): @@ -615,15 +624,17 @@ def map_if(self, expr, type_context): types = {8: to_loopy_type(np.int64), 4: to_loopy_type(np.int32), 2: to_loopy_type(np.int16), 1: to_loopy_type(np.int8)} length = self.codegen_state.vectorization_info.length - if (index_type.itemsize != result_type.itemsize and - result_type.itemsize in types): - # Need to convert index type into result type size. - # Item size is measured in bytes. - index_type = types[result_type.itemsize] - elif (index_type.itemsize * length != result_type.itemsize and - (result_type.itemsize // length) in types): - - index_type = types[result_type.itemsize // length] + if self.codegen_state.target.is_vector_dtype(result_type): + if (index_type.itemsize != result_type.itemsize and + (result_type.itemsize // length) in types): + index_type = types[result_type.itemsize] + else: + raise ValueError("Types incompatible") + else: + # We know result is going to be a vector. + if (index_type.itemsize != result_type.itemsize and + result_type.itemsize in types): + index_type = types[result_type.itemsize] vector_type = self.codegen_state.target.vector_dtype(index_type, length) conditional_needed_loopy_type = to_loopy_type(vector_type) From 337f5e4045be23bf34571e7c80ea4c3eb6da46bd Mon Sep 17 00:00:00 2001 From: Nick Date: Wed, 12 Mar 2025 17:34:34 -0500 Subject: [PATCH 14/18] Use the typed literal instead of literal. --- loopy/target/c/codegen/expression.py | 17 +++++++++-------- loopy/target/ispc.py | 8 ++++---- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py index 83c13dfe5..6b52256e9 100644 --- a/loopy/target/c/codegen/expression.py +++ b/loopy/target/c/codegen/expression.py @@ -49,7 +49,7 @@ from loopy.expression import dtype_to_type_context from loopy.target.c import CExpression from loopy.type_inference import TypeInferenceMapper, TypeReader -from loopy.types import LoopyType +from loopy.types import LoopyType, to_loopy_type from loopy.typing import Expression, is_integer @@ -435,7 +435,7 @@ def map_type_cast(self, expr: TypeCast, type_context: str): return self.rec(expr.child, type_context, expr.type) def map_constant(self, expr, type_context): - from loopy.symbolic import Literal + from loopy.symbolic import TypedLiteral if isinstance(expr, (complex, np.complexfloating)): real = self.rec(expr.real, type_context) @@ -462,10 +462,10 @@ def map_constant(self, expr, type_context): # FIXME: This assumes a 32-bit architecture. if isinstance(expr, np.float32): - return Literal(repr(float(expr))+"f") + return TypedLiteral(repr(float(expr))+"f", to_loopy_type(np.float32)) elif isinstance(expr, np.float64): - return Literal(repr(float(expr))) + return TypedLiteral(repr(float(expr)), to_loopy_type(np.float64)) # Disabled for now, possibly should be a subtarget. # elif isinstance(expr, np.float128): @@ -478,18 +478,19 @@ def map_constant(self, expr, type_context): suffix += "u" if iinfo.max > (2**31-1): suffix += "l" - return Literal(repr(int(expr))+suffix) + return TypedLiteral(repr(int(expr))+suffix, to_loopy_type(iinfo.dtype)) elif isinstance(expr, np.bool_): - return Literal("true") if expr else Literal("false") + return TypedLiteral("true", to_loopy_type(np.bool_)) if expr \ + else TypedLiteral("false", to_loopy_type(np.bool_)) else: raise LoopyError("do not know how to generate code for " "constant of numpy type '%s'" % type(expr).__name__) elif np.isfinite(expr): if type_context == "f": - return Literal(repr(float(expr))+"f") + return TypedLiteral(repr(float(expr))+"f", to_loopy_type(np.float32)) elif type_context == "d": - return Literal(repr(float(expr))) + return TypedLiteral(repr(float(expr)), to_loopy_type(np.float64)) elif type_context in ["i", "b"]: return int(expr) else: diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py index 34a88328c..823bf4218 100644 --- a/loopy/target/ispc.py +++ b/loopy/target/ispc.py @@ -44,14 +44,14 @@ CoefficientCollector, CombineMapper, GroupHardwareAxisIndex, - Literal, + TypedLiteral, LocalHardwareAxisIndex, SubstitutionMapper, flatten, ) from loopy.target.c import CFamilyASTBuilder, CFamilyTarget from loopy.target.c.codegen.expression import ExpressionToCExpressionMapper - +from loopy.types import to_loopy_type if TYPE_CHECKING: from loopy.codegen import CodeGenerationState @@ -125,10 +125,10 @@ def map_constant(self, expr, type_context): raise NotImplementedError("complex numbers in ispc") else: if type_context == "f": - return Literal(repr(float(expr))) + return TypedLiteral(repr(float(expr)), to_loopy_type(np.float32)) elif type_context == "d": # Keepin' the good ideas flowin' since '66. - return Literal(repr(float(expr))+"d") + return TypedLiteral(repr(float(expr))+"d", to_loopy_type(np.float64)) elif type_context in ["i", "b"]: return expr else: From a52ea24025b623e903c4d5510184d4578c3985ff Mon Sep 17 00:00:00 2001 From: Nick Date: Thu, 13 Mar 2025 15:20:50 -0500 Subject: [PATCH 15/18] Give the vector type when appropriate. --- loopy/type_inference.py | 33 +++++++++++++++++++++++++++++---- 1 file changed, 29 insertions(+), 4 deletions(-) diff --git a/loopy/type_inference.py b/loopy/type_inference.py index 8894af573..36ce49460 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -47,6 +47,7 @@ SubArrayRef, SubstitutionRuleExpander, SubstitutionRuleMappingContext, + TypedLiteral, parse_tagged_name, ) from loopy.translation_unit import ( @@ -365,6 +366,9 @@ def map_quotient(self, expr): else: return self.combine([n_dtype_set, d_dtype_set]) + def map_typed_literal(self, expr: TypedLiteral): + return [expr.dtype] + def map_constant(self, expr): if isinstance(expr, np.generic): return [NumpyType(np.dtype(type(expr)))] @@ -540,19 +544,40 @@ def map_lookup(self, expr): dtype = field[0] return [NumpyType(dtype)] + def is_vector_dtype(self, dtype): + target = self.kernel.target + + return target.is_vector_dtype(dtype) + def map_comparison(self, expr): - self(expr.left, return_tuple=False, return_dtype_set=False) - self(expr.right, return_tuple=False, return_dtype_set=False) + left = self(expr.left, return_tuple=False, return_dtype_set=False) + right = self(expr.right, return_tuple=False, return_dtype_set=False) + # We need to return a vector type if we either of the sides is a vector. + + vector_output = [] + for dtype in (left, right): + if self.is_vector_dtype(dtype): + vector_output.append(dtype) + if vector_output: + return vector_output return [NumpyType(np.dtype(np.bool_))] def map_logical_not(self, expr): - self.rec(expr.child) + child = self.rec(expr.child) + if self.is_vector_dtype(child): + return child return [NumpyType(np.dtype(np.bool_))] def map_logical_and(self, expr): + output_type = [] for child in expr.children: - self.rec(child) + type_to_check = self.rec(child) + if self.is_vector_dtype(type_to_check): + output_type.append(type_to_check) + + if output_type: + return output_type return [NumpyType(np.dtype(np.bool_))] From 420d16fe4ec055feb539b1ccd661cf9af5ae5d30 Mon Sep 17 00:00:00 2001 From: Nick Date: Thu, 13 Mar 2025 15:21:54 -0500 Subject: [PATCH 16/18] Ruff ispc. --- loopy/target/ispc.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py index 823bf4218..bcee0e905 100644 --- a/loopy/target/ispc.py +++ b/loopy/target/ispc.py @@ -44,15 +44,16 @@ CoefficientCollector, CombineMapper, GroupHardwareAxisIndex, - TypedLiteral, LocalHardwareAxisIndex, SubstitutionMapper, + TypedLiteral, flatten, ) from loopy.target.c import CFamilyASTBuilder, CFamilyTarget from loopy.target.c.codegen.expression import ExpressionToCExpressionMapper from loopy.types import to_loopy_type + if TYPE_CHECKING: from loopy.codegen import CodeGenerationState from loopy.codegen.result import CodeGenerationResult From a624ffd39a8ad4fb17b2b8be3df93fd71572dabf Mon Sep 17 00:00:00 2001 From: Nick Date: Thu, 13 Mar 2025 15:33:02 -0500 Subject: [PATCH 17/18] More ruff. --- test/test_target.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_target.py b/test/test_target.py index 9231ffe78..b9d22cd87 100644 --- a/test/test_target.py +++ b/test/test_target.py @@ -898,7 +898,7 @@ def test_cl_vectorize_index_variable(ctx_factory): i = np.arange(16) j = np.arange(4) - ind = 4*i[:,None] + j + ind = 4*i[:, None] + j result_ref = np.where(ind < 32, a*3, np.sin(a)) assert np.allclose(result, result_ref) From 34e50f1c37a4392a15ac8a9eba7c10facf46a5e7 Mon Sep 17 00:00:00 2001 From: Nick Date: Thu, 13 Mar 2025 16:58:28 -0500 Subject: [PATCH 18/18] Add the for tag to i_outer. --- test/test_target.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_target.py b/test/test_target.py index b9d22cd87..76a38518d 100644 --- a/test/test_target.py +++ b/test/test_target.py @@ -884,7 +884,7 @@ def test_cl_vectorize_index_variable(ctx_factory): knl = lp.split_array_axis(knl, "a,b", 0, 4) knl = lp.split_iname(knl, "i", 4) - knl = lp.tag_inames(knl, {"i_inner": "vec"}) + knl = lp.tag_inames(knl, {"i_inner": "vec", "i_outer": "for"}) knl = lp.tag_array_axes(knl, "a,b", "c,vec") knl = lp.set_options(knl, write_code=True) knl = lp.assume(knl, "n % 4 = 0 and n>0")