From 9874d9cc376ccfe4c4cc41ecd05b87fab6809715 Mon Sep 17 00:00:00 2001
From: Nick <koskelo2@illinois.edu>
Date: Thu, 20 Feb 2025 17:20:05 -0600
Subject: [PATCH 01/18] Add a vector constant of the appropriate size.

---
 loopy/expression.py |  9 ++++-----
 test/test_target.py | 30 ++++++++++++++++++++++++++++++
 2 files changed, 34 insertions(+), 5 deletions(-)
diff --git a/loopy/expression.py b/loopy/expression.py
index 5a11b8354..881e41ac6 100644
--- a/loopy/expression.py
+++ b/loopy/expression.py
@@ -22,7 +22,9 @@
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 """
-from typing import TYPE_CHECKING
+from typing import (
+    TYPE_CHECKING,
+)
 
 import numpy as np
 
@@ -162,10 +164,7 @@ def map_constant(self, expr: object) -> bool:
 
     def map_variable(self, expr: p.Variable) -> bool:
         if expr.name == self.vec_iname:
-            # Technically, this is doable. But we're not going there.
-            raise UnvectorizableError()
-
-        # A single variable is always a scalar.
+            return True
         return False
 
     map_tagged_variable = map_variable
diff --git a/test/test_target.py b/test/test_target.py
index fe2ad1d8a..5e9014f0a 100644
--- a/test/test_target.py
+++ b/test/test_target.py
@@ -875,6 +875,36 @@ def test_float3():
     assert "float3" in device_code
 
 
+def test_cl_vectorize_index_variable(ctx_factory):
+    knl = lp.make_kernel(
+            "{ [i]: 0<=i<n }",
+            """
+            b[i] = a[i]*3 if i < 32 else sin(a[i])
+            """)
+
+    knl = lp.split_array_axis(knl, "a,b", 0, 4)
+    knl = lp.split_iname(knl, "i", 4)
+    knl = lp.tag_inames(knl, {"i_inner": "vec"})
+    knl = lp.tag_array_axes(knl, "a,b", "c,vec")
+    knl = lp.set_options(knl, write_code=True)
+    knl = lp.assume(knl, "n % 4 = 0 and n>0")
+
+    rng = np.random.default_rng(seed=12)
+    a = rng.normal(size=(16, 4))
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+    knl = lp.add_and_infer_dtypes(knl, {"a": np.float64, "n": np.int64})
+    _evt, (result,) = knl(queue, a=a, n=a.size)
+
+    result_ref = np.zeros(a.shape, dtype=np.float64)
+    for i in range(16):
+        for j in range(4):
+            ind = i*4 + j
+            result_ref[i, j] = a[i, j] * 3 if ind < 32 else np.sin(a[i, j])
+
+    assert np.allclose(result, result_ref)
+
+
 if __name__ == "__main__":
     import sys
     if len(sys.argv) > 1:

From 2bf9c8bf366b134dc12315431b34e14736c5bdc7 Mon Sep 17 00:00:00 2001
From: Nick <koskelo2@illinois.edu>
Date: Mon, 24 Feb 2025 13:52:31 -0600
Subject: [PATCH 02/18] Move the OpenCL specifics to an OpenCL mapper. Modify
 typecast for vector types.

---
 loopy/target/c/codegen/expression.py | 41 +++++++++++++++++++++++++++-
 loopy/target/opencl.py               | 24 ++++++++++++++++
 2 files changed, 64 insertions(+), 1 deletion(-)

diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py
index 83c13dfe5..03ce925bc 100644
--- a/loopy/target/c/codegen/expression.py
+++ b/loopy/target/c/codegen/expression.py
@@ -130,6 +130,12 @@ def wrap_in_typecast(self, actual_type: LoopyType, needed_type: LoopyType, s):
         if actual_type != needed_type:
             registry = self.codegen_state.ast_builder.target.get_dtype_registry()
             cast = var("(%s) " % registry.dtype_to_ctype(needed_type))
+            if self.codegen_state.target.is_vector_dtype(needed_type):
+                # OpenCL does not let you do explicit vector type casts.
+                # Instead you need to call their function which is of the form
+                # convert_<desttype><n>(src) where desttype is the type you want and n
+                # is the number of elements in the vector which is the same as in src.
+                cast = var("convert_%s" % registry.dtype_to_ctype(needed_type))
             return cast(s)
 
         return s
@@ -414,9 +420,37 @@ def map_remainder(self, expr, type_context):
     def map_if(self, expr, type_context):
         from loopy.types import to_loopy_type
         result_type = self.infer_type(expr)
+        conditional_needed_loopy_type = to_loopy_type(np.bool_)
+        if self.codegen_state.vectorization_info:
+            from loopy.expression import VectorizabilityChecker
+            from loopy.codegen import UnvectorizableError
+            checker = VectorizabilityChecker(self.codegen_state.kernel,
+                                     self.codegen_state.vectorization_info.iname,
+                                     self.codegen_state.vectorization_info.length)
+
+            try:
+                is_vector = checker(expr)
+
+                if is_vector:
+                    """
+                    We could have a vector literal here.
+                    So we may need to type cast the condition.
+                    OpenCL specification states that for ( c ? a : b)
+                    to be vectorized appropriately c must have the same
+                    number of elements in the vector as that of a and b.
+                    Also each element must have the same number of bits,
+                    and c must be an integral type.
+                    """
+                    index_type = to_loopy_type(np.int64)
+                    if type_context == "f":
+                        index_type = to_loopy_type(np.int32)
+                    conditional_needed_loopy_type = to_loopy_type(self.codegen_state.target.vector_dtype(index_type,
+                                                    self.codegen_state.vectorization_info.length))
+            except UnvectorizableError:
+                pass
         return type(expr)(
                 self.rec(expr.condition, type_context,
-                         to_loopy_type(np.bool_)),
+                         conditional_needed_loopy_type),
                 self.rec(expr.then, type_context, result_type),
                 self.rec(expr.else_, type_context, result_type),
                 )
@@ -712,8 +746,13 @@ def map_min(self, expr, enclosing_prec):
 
     map_max = map_min
 
+    def map_type_cast(self, expr, enclosing_prec):
+        breakpoint()
+        return super().map_type_cast(expr, enclosing_prec)
+
     def map_if(self, expr, enclosing_prec):
         from pymbolic.mapper.stringifier import PREC_CALL, PREC_NONE
+        breakpoint()
         return "({} ? {} : {})".format(
                 # Force parentheses around the condition to prevent compiler
                 # warnings regarding precedence (e.g. with POCL 1.8/LLVM 12):
diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py
index 07c5b49d0..c6266b341 100644
--- a/loopy/target/opencl.py
+++ b/loopy/target/opencl.py
@@ -545,6 +545,15 @@ def wrap_in_typecast(self, actual_type, needed_dtype, s):
             from pymbolic.primitives import Comparison
             return Comparison(s, "!=", 0)
 
+        registry = self.codegen_state.ast_builder.target.get_dtype_registry()
+        if self.codegen_state.target.is_vector_dtype(needed_dtype):
+            # OpenCL does not let you do explicit vector type casts.
+            # Instead you need to call their function which is of the form
+            # convert_<desttype><n>(src) where desttype is the type you want and n
+            # is the number of elements in the vector which is the same as in src.
+            cast = var("convert_%s" % registry.dtype_to_ctype(needed_dtype))
+            return cast(s)
+
         return super().wrap_in_typecast(actual_type, needed_dtype, s)
 
     def map_group_hw_index(self, expr, type_context):
@@ -553,6 +562,21 @@ def map_group_hw_index(self, expr, type_context):
     def map_local_hw_index(self, expr, type_context):
         return var("lid")(expr.axis)
 
+    def map_variable(self, expr, type_context):
+
+        if self.codegen_state.vectorization_info:
+            if self.codegen_state.vectorization_info.iname == expr.name:
+                # This needs to be converted into a vector literal.
+                from loopy.symbolic import Literal
+                vector_length = self.codegen_state.vectorization_info.length
+                index_type = self.codegen_state.kernel.index_dtype
+                vector_type = self.codegen_state.target.vector_dtype(index_type,
+                                                                     vector_length)
+                typecast = self.codegen_state.target.dtype_to_typename(vector_type)
+                vector_literal = f"(({typecast})" + " (" + \
+                        ",".join([f"{i}" for i in range(vector_length)]) + "))"
+                return Literal(vector_literal)
+        return super().map_variable(expr, type_context)
 # }}}
 
 

From 133055a4aff226c95580afa85a6b3d24c1438fdd Mon Sep 17 00:00:00 2001
From: Nick <koskelo2@illinois.edu>
Date: Mon, 24 Feb 2025 13:58:04 -0600
Subject: [PATCH 03/18] Remove unused code.

---
 loopy/target/c/codegen/expression.py | 15 ++-------------
 1 file changed, 2 insertions(+), 13 deletions(-)

diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py
index 03ce925bc..c4cf480ba 100644
--- a/loopy/target/c/codegen/expression.py
+++ b/loopy/target/c/codegen/expression.py
@@ -130,14 +130,8 @@ def wrap_in_typecast(self, actual_type: LoopyType, needed_type: LoopyType, s):
         if actual_type != needed_type:
             registry = self.codegen_state.ast_builder.target.get_dtype_registry()
             cast = var("(%s) " % registry.dtype_to_ctype(needed_type))
-            if self.codegen_state.target.is_vector_dtype(needed_type):
-                # OpenCL does not let you do explicit vector type casts.
-                # Instead you need to call their function which is of the form
-                # convert_<desttype><n>(src) where desttype is the type you want and n
-                # is the number of elements in the vector which is the same as in src.
-                cast = var("convert_%s" % registry.dtype_to_ctype(needed_type))
-            return cast(s)
 
+            return cast(s)
         return s
 
     def rec(self, expr, type_context=None, needed_type: LoopyType | None = None):  # type: ignore[override]
@@ -422,8 +416,8 @@ def map_if(self, expr, type_context):
         result_type = self.infer_type(expr)
         conditional_needed_loopy_type = to_loopy_type(np.bool_)
         if self.codegen_state.vectorization_info:
-            from loopy.expression import VectorizabilityChecker
             from loopy.codegen import UnvectorizableError
+            from loopy.expression import VectorizabilityChecker
             checker = VectorizabilityChecker(self.codegen_state.kernel,
                                      self.codegen_state.vectorization_info.iname,
                                      self.codegen_state.vectorization_info.length)
@@ -746,13 +740,8 @@ def map_min(self, expr, enclosing_prec):
 
     map_max = map_min
 
-    def map_type_cast(self, expr, enclosing_prec):
-        breakpoint()
-        return super().map_type_cast(expr, enclosing_prec)
-
     def map_if(self, expr, enclosing_prec):
         from pymbolic.mapper.stringifier import PREC_CALL, PREC_NONE
-        breakpoint()
         return "({} ? {} : {})".format(
                 # Force parentheses around the condition to prevent compiler
                 # warnings regarding precedence (e.g. with POCL 1.8/LLVM 12):

From a1f960638c311584c0504fa08542bdbba884c2e6 Mon Sep 17 00:00:00 2001
From: Nick <koskelo2@illinois.edu>
Date: Mon, 24 Feb 2025 14:21:34 -0600
Subject: [PATCH 04/18] Modify the typecast for vector dtypes.

---
 loopy/target/c/codegen/expression.py | 32 ++--------------------
 loopy/target/opencl.py               | 41 ++++++++++++++++++++++++++++
 2 files changed, 43 insertions(+), 30 deletions(-)

diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py
index c4cf480ba..83c13dfe5 100644
--- a/loopy/target/c/codegen/expression.py
+++ b/loopy/target/c/codegen/expression.py
@@ -130,8 +130,8 @@ def wrap_in_typecast(self, actual_type: LoopyType, needed_type: LoopyType, s):
         if actual_type != needed_type:
             registry = self.codegen_state.ast_builder.target.get_dtype_registry()
             cast = var("(%s) " % registry.dtype_to_ctype(needed_type))
-
             return cast(s)
+
         return s
 
     def rec(self, expr, type_context=None, needed_type: LoopyType | None = None):  # type: ignore[override]
@@ -414,37 +414,9 @@ def map_remainder(self, expr, type_context):
     def map_if(self, expr, type_context):
         from loopy.types import to_loopy_type
         result_type = self.infer_type(expr)
-        conditional_needed_loopy_type = to_loopy_type(np.bool_)
-        if self.codegen_state.vectorization_info:
-            from loopy.codegen import UnvectorizableError
-            from loopy.expression import VectorizabilityChecker
-            checker = VectorizabilityChecker(self.codegen_state.kernel,
-                                     self.codegen_state.vectorization_info.iname,
-                                     self.codegen_state.vectorization_info.length)
-
-            try:
-                is_vector = checker(expr)
-
-                if is_vector:
-                    """
-                    We could have a vector literal here.
-                    So we may need to type cast the condition.
-                    OpenCL specification states that for ( c ? a : b)
-                    to be vectorized appropriately c must have the same
-                    number of elements in the vector as that of a and b.
-                    Also each element must have the same number of bits,
-                    and c must be an integral type.
-                    """
-                    index_type = to_loopy_type(np.int64)
-                    if type_context == "f":
-                        index_type = to_loopy_type(np.int32)
-                    conditional_needed_loopy_type = to_loopy_type(self.codegen_state.target.vector_dtype(index_type,
-                                                    self.codegen_state.vectorization_info.length))
-            except UnvectorizableError:
-                pass
         return type(expr)(
                 self.rec(expr.condition, type_context,
-                         conditional_needed_loopy_type),
+                         to_loopy_type(np.bool_)),
                 self.rec(expr.then, type_context, result_type),
                 self.rec(expr.else_, type_context, result_type),
                 )
diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py
index c6266b341..c4453227d 100644
--- a/loopy/target/opencl.py
+++ b/loopy/target/opencl.py
@@ -577,6 +577,47 @@ def map_variable(self, expr, type_context):
                         ",".join([f"{i}" for i in range(vector_length)]) + "))"
                 return Literal(vector_literal)
         return super().map_variable(expr, type_context)
+
+    def map_if(self, expr, type_context):
+        from loopy.types import to_loopy_type
+        result_type = self.infer_type(expr)
+        conditional_needed_loopy_type = to_loopy_type(np.bool_)
+        if self.codegen_state.vectorization_info:
+            from loopy.codegen import UnvectorizableError
+            from loopy.expression import VectorizabilityChecker
+            checker = VectorizabilityChecker(self.codegen_state.kernel,
+                                     self.codegen_state.vectorization_info.iname,
+                                     self.codegen_state.vectorization_info.length)
+
+            try:
+                is_vector = checker(expr)
+
+                if is_vector:
+                    """
+                    We could have a vector literal here.
+                    So we may need to type cast the condition.
+                    OpenCL specification states that for ( c ? a : b)
+                    to be vectorized appropriately c must have the same
+                    number of elements in the vector as that of a and b.
+                    Also each element must have the same number of bits,
+                    and c must be an integral type.
+                    """
+                    index_type = to_loopy_type(np.int64)
+                    if type_context == "f":
+                        index_type = to_loopy_type(np.int32)
+                    length = self.codegen_state.vectorization_info.length
+                    vector_type = self.codegen_state.target.vector_dtype(index_type,
+                                                                         length)
+                    conditional_needed_loopy_type = to_loopy_type(vector_type)
+            except UnvectorizableError:
+                pass
+
+        return type(expr)(
+                self.rec(expr.condition, type_context,
+                         conditional_needed_loopy_type),
+                self.rec(expr.then, type_context, result_type),
+                self.rec(expr.else_, type_context, result_type),
+                )
 # }}}
 
 

From 65766ba0fb1934fd848397a2ba004a9ebfb57c63 Mon Sep 17 00:00:00 2001
From: Nick <koskelo2@illinois.edu>
Date: Mon, 24 Feb 2025 15:05:38 -0600
Subject: [PATCH 05/18] Just return the value if we do not need to typecast.

---
 loopy/target/opencl.py | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py
index c4453227d..b57747e54 100644
--- a/loopy/target/opencl.py
+++ b/loopy/target/opencl.py
@@ -46,6 +46,7 @@
 
     from loopy.codegen import CodeGenerationState
     from loopy.codegen.result import CodeGenerationResult
+    from loopy.kernel import LoopKernel
 
 
 # {{{ dtype registry wrappers
@@ -456,7 +457,7 @@ def get_opencl_callables():
 
 # {{{ symbol mangler
 
-def opencl_symbol_mangler(kernel, name):
+def opencl_symbol_mangler(kernel: LoopKernel, name: str):
     # FIXME: should be more picky about exact names
     if name.startswith("FLT_"):
         return NumpyType(np.dtype(np.float32)), name
@@ -544,25 +545,29 @@ def wrap_in_typecast(self, actual_type, needed_dtype, s):
             # CL does not perform implicit conversion from float-type to a bool.
             from pymbolic.primitives import Comparison
             return Comparison(s, "!=", 0)
+        
+        if needed_dtype == actual_type:
+            return s
 
         registry = self.codegen_state.ast_builder.target.get_dtype_registry()
-        if self.codegen_state.target.is_vector_dtype(needed_dtype):
-            # OpenCL does not let you do explicit vector type casts.
-            # Instead you need to call their function which is of the form
-            # convert_<desttype><n>(src) where desttype is the type you want and n
+        if self.codegen_state.target.is_vector_dtype(needed_dtype) and \
+            self.codegen_state.target.is_vector_dtype(actual_type):
+            # OpenCL does not let you do explicit vector type casts between vector
+            # types. Instead you need to call their function which is of the form
+            # <desttype> convert_<desttype><n>(src) where n
             # is the number of elements in the vector which is the same as in src.
             cast = var("convert_%s" % registry.dtype_to_ctype(needed_dtype))
             return cast(s)
 
         return super().wrap_in_typecast(actual_type, needed_dtype, s)
 
-    def map_group_hw_index(self, expr, type_context):
+    def map_group_hw_index(self, expr, type_context: str):
         return var("gid")(expr.axis)
 
-    def map_local_hw_index(self, expr, type_context):
+    def map_local_hw_index(self, expr, type_context: str):
         return var("lid")(expr.axis)
 
-    def map_variable(self, expr, type_context):
+    def map_variable(self, expr, type_context: str):
 
         if self.codegen_state.vectorization_info:
             if self.codegen_state.vectorization_info.iname == expr.name:
@@ -578,7 +583,7 @@ def map_variable(self, expr, type_context):
                 return Literal(vector_literal)
         return super().map_variable(expr, type_context)
 
-    def map_if(self, expr, type_context):
+    def map_if(self, expr, type_context: str):
         from loopy.types import to_loopy_type
         result_type = self.infer_type(expr)
         conditional_needed_loopy_type = to_loopy_type(np.bool_)

From 3061cc69da25e989b5dfe391662179124b000852 Mon Sep 17 00:00:00 2001
From: Nick <koskelo2@illinois.edu>
Date: Mon, 24 Feb 2025 15:06:06 -0600
Subject: [PATCH 06/18] Ruff: remove trailing whitespace in wrap_in_typecast.

---
 loopy/target/opencl.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py
index b57747e54..67c5a5634 100644
--- a/loopy/target/opencl.py
+++ b/loopy/target/opencl.py
@@ -545,7 +545,7 @@ def wrap_in_typecast(self, actual_type, needed_dtype, s):
             # CL does not perform implicit conversion from float-type to a bool.
             from pymbolic.primitives import Comparison
             return Comparison(s, "!=", 0)
-        
+
         if needed_dtype == actual_type:
             return s
 

From af7d947ba5f8c221a8f400a7af3ba4876d499297 Mon Sep 17 00:00:00 2001
From: Nick <koskelo2@illinois.edu>
Date: Mon, 24 Feb 2025 15:21:37 -0600
Subject: [PATCH 07/18] Use the convert function only if we are converting from
 a boolean or a previously vectorized dtype.

---
 loopy/target/opencl.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py
index 67c5a5634..b27a15384 100644
--- a/loopy/target/opencl.py
+++ b/loopy/target/opencl.py
@@ -550,14 +550,15 @@ def wrap_in_typecast(self, actual_type, needed_dtype, s):
             return s
 
         registry = self.codegen_state.ast_builder.target.get_dtype_registry()
-        if self.codegen_state.target.is_vector_dtype(needed_dtype) and \
-            self.codegen_state.target.is_vector_dtype(actual_type):
+        if self.codegen_state.target.is_vector_dtype(needed_dtype):
             # OpenCL does not let you do explicit vector type casts between vector
             # types. Instead you need to call their function which is of the form
             # <desttype> convert_<desttype><n>(src) where n
             # is the number of elements in the vector which is the same as in src.
-            cast = var("convert_%s" % registry.dtype_to_ctype(needed_dtype))
-            return cast(s)
+            if self.codegen_state.target.is_vector_dtype(actual_type) or \
+                actual_type.dtype.kind == "b":
+                cast = var("convert_%s" % registry.dtype_to_ctype(needed_dtype))
+                return cast(s)
 
         return super().wrap_in_typecast(actual_type, needed_dtype, s)
 

From 141d9d72fc2161b16ec028988e9669265d643c77 Mon Sep 17 00:00:00 2001
From: Nick <koskelo2@illinois.edu>
Date: Mon, 24 Feb 2025 15:38:42 -0600
Subject: [PATCH 08/18] Mypy: drop `type_context: str` annotations in opencl.py.

---
 loopy/target/opencl.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py
index b27a15384..121135662 100644
--- a/loopy/target/opencl.py
+++ b/loopy/target/opencl.py
@@ -562,13 +562,13 @@ def wrap_in_typecast(self, actual_type, needed_dtype, s):
 
         return super().wrap_in_typecast(actual_type, needed_dtype, s)
 
-    def map_group_hw_index(self, expr, type_context: str):
+    def map_group_hw_index(self, expr, type_context):
         return var("gid")(expr.axis)
 
-    def map_local_hw_index(self, expr, type_context: str):
+    def map_local_hw_index(self, expr, type_context):
         return var("lid")(expr.axis)
 
-    def map_variable(self, expr, type_context: str):
+    def map_variable(self, expr, type_context):
 
         if self.codegen_state.vectorization_info:
             if self.codegen_state.vectorization_info.iname == expr.name:
@@ -584,7 +584,7 @@ def map_variable(self, expr, type_context: str):
                 return Literal(vector_literal)
         return super().map_variable(expr, type_context)
 
-    def map_if(self, expr, type_context: str):
+    def map_if(self, expr, type_context):
         from loopy.types import to_loopy_type
         result_type = self.infer_type(expr)
         conditional_needed_loopy_type = to_loopy_type(np.bool_)

From 7d95c97602533938e4ec6274f38a9cf1747bab55 Mon Sep 17 00:00:00 2001
From: Nick <koskelo2@illinois.edu>
Date: Mon, 24 Feb 2025 20:40:58 -0600
Subject: [PATCH 09/18] Respond to comments.

---
 loopy/expression.py    |  1 +
 loopy/target/opencl.py | 31 ++++++++++++++++++++-----------
 2 files changed, 21 insertions(+), 11 deletions(-)

diff --git a/loopy/expression.py b/loopy/expression.py
index 881e41ac6..3a87dbe61 100644
--- a/loopy/expression.py
+++ b/loopy/expression.py
@@ -164,6 +164,7 @@ def map_constant(self, expr: object) -> bool:
 
     def map_variable(self, expr: p.Variable) -> bool:
         if expr.name == self.vec_iname:
+            # Technically, this is doable.
             return True
         return False
 
diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py
index 121135662..d58aecc13 100644
--- a/loopy/target/opencl.py
+++ b/loopy/target/opencl.py
@@ -457,7 +457,8 @@ def get_opencl_callables():
 
 # {{{ symbol mangler
 
-def opencl_symbol_mangler(kernel: LoopKernel, name: str):
+def opencl_symbol_mangler(kernel: LoopKernel,
+                          name: str) -> tuple[NumpyType, str] | None:
     # FIXME: should be more picky about exact names
     if name.startswith("FLT_"):
         return NumpyType(np.dtype(np.float32)), name
@@ -555,6 +556,7 @@ def wrap_in_typecast(self, actual_type, needed_dtype, s):
             # types. Instead you need to call their function which is of the form
             # <desttype> convert_<desttype><n>(src) where n
             # is the number of elements in the vector which is the same as in src.
+            # https://registry.khronos.org/OpenCL/specs/3.0-unified/html/OpenCL_C.html#explicit-casts
             if self.codegen_state.target.is_vector_dtype(actual_type) or \
                 actual_type.dtype.kind == "b":
                 cast = var("convert_%s" % registry.dtype_to_ctype(needed_dtype))
@@ -600,18 +602,25 @@ def map_if(self, expr, type_context):
 
                 if is_vector:
                     """
-                    We could have a vector literal here.
-                    So we may need to type cast the condition.
-                    OpenCL specification states that for ( c ? a : b)
-                    to be vectorized appropriately c must have the same
-                    number of elements in the vector as that of a and b.
-                    Also each element must have the same number of bits,
-                    and c must be an integral type.
+                    We could have a vector literal here which may need to be
+                    converted to an appropriate size. The OpenCL specification states
+                    that for ( c ? a : b) a, b, and c must have the same
+                    number of elements and bits and that c must be an integral type.
+                    https://registry.khronos.org/OpenCL/specs/3.0-unified/html/OpenCL_C.html#table-builtin-relational
                     """
-                    index_type = to_loopy_type(np.int64)
-                    if type_context == "f":
-                        index_type = to_loopy_type(np.int32)
+                    index_type = to_loopy_type(self.codegen_state.kernel.index_dtype)
+                    types = {8: to_loopy_type(np.int64), 4: to_loopy_type(np.int32),
+                             2: to_loopy_type(np.int16), 1: to_loopy_type(np.int8)}
                     length = self.codegen_state.vectorization_info.length
+                    if index_type.itemsize != result_type.itemsize and \
+                        result_type.itemsize in types.keys():
+                        # Need to convert index type into result type size.
+                        # Item size is measured in bytes.
+                        index_type = types[result_type.itemsize]
+                    elif index_type.itemsize * length != result_type.itemsize and \
+                        (result_type.itemsize // length) in types.keys():
+
+                        index_type = types[result_type.itemsize // length]
                     vector_type = self.codegen_state.target.vector_dtype(index_type,
                                                                          length)
                     conditional_needed_loopy_type = to_loopy_type(vector_type)

From 144b3df224f7bbd523a9f8ef162b696016abe8d7 Mon Sep 17 00:00:00 2001
From: Nick <koskelo2@illinois.edu>
Date: Thu, 27 Feb 2025 10:27:22 -0600
Subject: [PATCH 10/18] Respond to Andreas's comments.

---
 loopy/expression.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/loopy/expression.py b/loopy/expression.py
index 3a87dbe61..3c2b6755d 100644
--- a/loopy/expression.py
+++ b/loopy/expression.py
@@ -22,9 +22,7 @@
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 """
-from typing import (
-    TYPE_CHECKING,
-)
+from typing import TYPE_CHECKING
 
 import numpy as np
 
@@ -166,6 +164,7 @@ def map_variable(self, expr: p.Variable) -> bool:
         if expr.name == self.vec_iname:
             # Technically, this is doable.
             return True
+        # A single variable is always a scalar.
         return False
 
     map_tagged_variable = map_variable

From e261f65ae8b73461bf9a2bd7a74cc2f8b3e626bb Mon Sep 17 00:00:00 2001
From: Nick <koskelo2@illinois.edu>
Date: Tue, 11 Mar 2025 10:53:37 -0500
Subject: [PATCH 11/18] Responding to comments.

---
 loopy/expression.py    |  1 -
 loopy/target/opencl.py | 23 ++++++++++++-----------
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/loopy/expression.py b/loopy/expression.py
index 3c2b6755d..a00e4c7f0 100644
--- a/loopy/expression.py
+++ b/loopy/expression.py
@@ -162,7 +162,6 @@ def map_constant(self, expr: object) -> bool:
 
     def map_variable(self, expr: p.Variable) -> bool:
         if expr.name == self.vec_iname:
-            # Technically, this is doable.
             return True
         # A single variable is always a scalar.
         return False
diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py
index d58aecc13..1b244412d 100644
--- a/loopy/target/opencl.py
+++ b/loopy/target/opencl.py
@@ -24,6 +24,7 @@
 THE SOFTWARE.
 """
 
+from contextlib import suppress
 from typing import TYPE_CHECKING, Literal, Sequence
 
 import numpy as np
@@ -557,8 +558,7 @@ def wrap_in_typecast(self, actual_type, needed_dtype, s):
             # <desttype> convert_<desttype><n>(src) where n
             # is the number of elements in the vector which is the same as in src.
             # https://registry.khronos.org/OpenCL/specs/3.0-unified/html/OpenCL_C.html#explicit-casts
-            if self.codegen_state.target.is_vector_dtype(actual_type) or \
-                actual_type.dtype.kind == "b":
+            if self.codegen_state.target.is_vector_dtype(actual_type):
                 cast = var("convert_%s" % registry.dtype_to_ctype(needed_dtype))
                 return cast(s)
 
@@ -580,8 +580,8 @@ def map_variable(self, expr, type_context):
                 index_type = self.codegen_state.kernel.index_dtype
                 vector_type = self.codegen_state.target.vector_dtype(index_type,
                                                                      vector_length)
-                typecast = self.codegen_state.target.dtype_to_typename(vector_type)
-                vector_literal = f"(({typecast})" + " (" + \
+                typename = self.codegen_state.target.dtype_to_typename(vector_type)
+                vector_literal = f"(({typename})" + " (" + \
                         ",".join([f"{i}" for i in range(vector_length)]) + "))"
                 return Literal(vector_literal)
         return super().map_variable(expr, type_context)
@@ -597,7 +597,10 @@ def map_if(self, expr, type_context):
                                      self.codegen_state.vectorization_info.iname,
                                      self.codegen_state.vectorization_info.length)
 
-            try:
+            with suppress(UnvectorizableError):
+                # We know there is an expression in codegen which can be vectorized.
+                # We are checking if this is one of them. If it is not, then we can
+                # just continue with scalar code generation for this expression.
                 is_vector = checker(expr)
 
                 if is_vector:
@@ -612,20 +615,18 @@ def map_if(self, expr, type_context):
                     types = {8: to_loopy_type(np.int64), 4: to_loopy_type(np.int32),
                              2: to_loopy_type(np.int16), 1: to_loopy_type(np.int8)}
                     length = self.codegen_state.vectorization_info.length
-                    if index_type.itemsize != result_type.itemsize and \
-                        result_type.itemsize in types.keys():
+                    if (index_type.itemsize != result_type.itemsize and
+                        result_type.itemsize in types):
                         # Need to convert index type into result type size.
                         # Item size is measured in bytes.
                         index_type = types[result_type.itemsize]
-                    elif index_type.itemsize * length != result_type.itemsize and \
-                        (result_type.itemsize // length) in types.keys():
+                    elif (index_type.itemsize * length != result_type.itemsize and
+                        (result_type.itemsize // length) in types):
 
                         index_type = types[result_type.itemsize // length]
                     vector_type = self.codegen_state.target.vector_dtype(index_type,
                                                                          length)
                     conditional_needed_loopy_type = to_loopy_type(vector_type)
-            except UnvectorizableError:
-                pass
 
         return type(expr)(
                 self.rec(expr.condition, type_context,

From 89474ea18af77b910ab947f3482850e6a51e6482 Mon Sep 17 00:00:00 2001
From: Nick <koskelo2@illinois.edu>
Date: Tue, 11 Mar 2025 14:08:21 -0500
Subject: [PATCH 12/18] Use the numpy vectorized version.

---
 test/test_target.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/test/test_target.py b/test/test_target.py
index 5e9014f0a..9231ffe78 100644
--- a/test/test_target.py
+++ b/test/test_target.py
@@ -896,11 +896,10 @@ def test_cl_vectorize_index_variable(ctx_factory):
     knl = lp.add_and_infer_dtypes(knl, {"a": np.float64, "n": np.int64})
     _evt, (result,) = knl(queue, a=a, n=a.size)
 
-    result_ref = np.zeros(a.shape, dtype=np.float64)
-    for i in range(16):
-        for j in range(4):
-            ind = i*4 + j
-            result_ref[i, j] = a[i, j] * 3 if ind < 32 else np.sin(a[i, j])
+    i = np.arange(16)
+    j = np.arange(4)
+    ind = 4*i[:,None] + j
+    result_ref = np.where(ind < 32, a*3, np.sin(a))
 
     assert np.allclose(result, result_ref)
 

From 1b9334de6e28c1ba6a1ba9efb583ec005fb6c610 Mon Sep 17 00:00:00 2001
From: Nick <koskelo2@illinois.edu>
Date: Wed, 12 Mar 2025 17:13:27 -0500
Subject: [PATCH 13/18] Add typed literal.

---
 loopy/symbolic.py      | 15 +++++++++++++++
 loopy/target/opencl.py | 35 +++++++++++++++++++++++------------
 2 files changed, 38 insertions(+), 12 deletions(-)

diff --git a/loopy/symbolic.py b/loopy/symbolic.py
index 20ff55fea..2bdd8db70 100644
--- a/loopy/symbolic.py
+++ b/loopy/symbolic.py
@@ -606,6 +606,21 @@ class Literal(LoopyExpressionBase):
     s: str
 
 
+@p.expr_dataclass()
+class TypedLiteral(Literal):
+    """A literal to be used during code generation which we know the type of.
+
+    .. note::
+
+        Only used in the output of
+        :mod:`loopy.target.c.codegen.expression.ExpressionToCExpressionMapper` (and
+        similar mappers). Not for use in Loopy source representation.
+    """
+
+    s: str
+    dtype: ToLoopyTypeConvertible
+
+
 @p.expr_dataclass()
 class ArrayLiteral(LoopyExpressionBase):
     """An array literal.
diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py
index 1b244412d..524ed7e6e 100644
--- a/loopy/target/opencl.py
+++ b/loopy/target/opencl.py
@@ -543,6 +543,7 @@ def opencl_preamble_generator(preamble_info):
 class ExpressionToOpenCLCExpressionMapper(ExpressionToCExpressionMapper):
 
     def wrap_in_typecast(self, actual_type, needed_dtype, s):
+
         if needed_dtype.dtype.kind == "b" and actual_type.dtype.kind == "f":
             # CL does not perform implicit conversion from float-type to a bool.
             from pymbolic.primitives import Comparison
@@ -558,7 +559,13 @@ def wrap_in_typecast(self, actual_type, needed_dtype, s):
             # <desttype> convert_<desttype><n>(src) where n
             # is the number of elements in the vector which is the same as in src.
             # https://registry.khronos.org/OpenCL/specs/3.0-unified/html/OpenCL_C.html#explicit-casts
-            if self.codegen_state.target.is_vector_dtype(actual_type):
+
+            # We infer the data type of (s) before we recurse down into (s) to convert
+            # to a CExpression. With vectorization, we can change the actual type of (s)
+            # from a scalar type to a vector type. So we are going to recompute the
+            # actual type.
+            type_of_s = self.infer_type(s)
+            if self.codegen_state.target.is_vector_dtype(type_of_s):
                 cast = var("convert_%s" % registry.dtype_to_ctype(needed_dtype))
                 return cast(s)
 
@@ -575,7 +582,7 @@ def map_variable(self, expr, type_context):
         if self.codegen_state.vectorization_info:
             if self.codegen_state.vectorization_info.iname == expr.name:
                 # This needs to be converted into a vector literal.
-                from loopy.symbolic import Literal
+                from loopy.symbolic import TypedLiteral
                 vector_length = self.codegen_state.vectorization_info.length
                 index_type = self.codegen_state.kernel.index_dtype
                 vector_type = self.codegen_state.target.vector_dtype(index_type,
@@ -583,7 +590,9 @@ def map_variable(self, expr, type_context):
                 typename = self.codegen_state.target.dtype_to_typename(vector_type)
                 vector_literal = f"(({typename})" + " (" + \
                         ",".join([f"{i}" for i in range(vector_length)]) + "))"
-                return Literal(vector_literal)
+                return TypedLiteral(vector_literal, vector_type)
+
+                # return Literal(vector_literal)
         return super().map_variable(expr, type_context)
 
     def map_if(self, expr, type_context):
@@ -615,15 +624,17 @@ def map_if(self, expr, type_context):
                     types = {8: to_loopy_type(np.int64), 4: to_loopy_type(np.int32),
                              2: to_loopy_type(np.int16), 1: to_loopy_type(np.int8)}
                     length = self.codegen_state.vectorization_info.length
-                    if (index_type.itemsize != result_type.itemsize and
-                        result_type.itemsize in types):
-                        # Need to convert index type into result type size.
-                        # Item size is measured in bytes.
-                        index_type = types[result_type.itemsize]
-                    elif (index_type.itemsize * length != result_type.itemsize and
-                        (result_type.itemsize // length) in types):
-
-                        index_type = types[result_type.itemsize // length]
+                    if self.codegen_state.target.is_vector_dtype(result_type):
+                        if (index_type.itemsize != result_type.itemsize and
+                            (result_type.itemsize // length) in types):
+                            index_type = types[result_type.itemsize]
+                        else:
+                            raise ValueError("Types incompatible")
+                    else:
+                        # We know result is going to be a vector.
+                        if (index_type.itemsize != result_type.itemsize and
+                            result_type.itemsize in types):
+                            index_type = types[result_type.itemsize]
                     vector_type = self.codegen_state.target.vector_dtype(index_type,
                                                                          length)
                     conditional_needed_loopy_type = to_loopy_type(vector_type)

From 337f5e4045be23bf34571e7c80ea4c3eb6da46bd Mon Sep 17 00:00:00 2001
From: Nick <koskelo2@illinois.edu>
Date: Wed, 12 Mar 2025 17:34:34 -0500
Subject: [PATCH 14/18] Use the typed literal instead of literal.

---
 loopy/target/c/codegen/expression.py | 17 +++++++++--------
 loopy/target/ispc.py                 |  8 ++++----
 2 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py
index 83c13dfe5..6b52256e9 100644
--- a/loopy/target/c/codegen/expression.py
+++ b/loopy/target/c/codegen/expression.py
@@ -49,7 +49,7 @@
 from loopy.expression import dtype_to_type_context
 from loopy.target.c import CExpression
 from loopy.type_inference import TypeInferenceMapper, TypeReader
-from loopy.types import LoopyType
+from loopy.types import LoopyType, to_loopy_type
 from loopy.typing import Expression, is_integer
 
 
@@ -435,7 +435,7 @@ def map_type_cast(self, expr: TypeCast, type_context: str):
         return self.rec(expr.child, type_context, expr.type)
 
     def map_constant(self, expr, type_context):
-        from loopy.symbolic import Literal
+        from loopy.symbolic import TypedLiteral
 
         if isinstance(expr, (complex, np.complexfloating)):
             real = self.rec(expr.real, type_context)
@@ -462,10 +462,10 @@ def map_constant(self, expr, type_context):
 
             # FIXME: This assumes a 32-bit architecture.
             if isinstance(expr, np.float32):
-                return Literal(repr(float(expr))+"f")
+                return TypedLiteral(repr(float(expr))+"f", to_loopy_type(np.float32))
 
             elif isinstance(expr, np.float64):
-                return Literal(repr(float(expr)))
+                return TypedLiteral(repr(float(expr)), to_loopy_type(np.float64))
 
             # Disabled for now, possibly should be a subtarget.
             # elif isinstance(expr, np.float128):
@@ -478,18 +478,19 @@ def map_constant(self, expr, type_context):
                     suffix += "u"
                 if iinfo.max > (2**31-1):
                     suffix += "l"
-                return Literal(repr(int(expr))+suffix)
+                return TypedLiteral(repr(int(expr))+suffix, to_loopy_type(iinfo.dtype))
             elif isinstance(expr, np.bool_):
-                return Literal("true") if expr else Literal("false")
+                return TypedLiteral("true", to_loopy_type(np.bool_)) if expr \
+                        else TypedLiteral("false", to_loopy_type(np.bool_))
             else:
                 raise LoopyError("do not know how to generate code for "
                         "constant of numpy type '%s'" % type(expr).__name__)
 
         elif np.isfinite(expr):
             if type_context == "f":
-                return Literal(repr(float(expr))+"f")
+                return TypedLiteral(repr(float(expr))+"f", to_loopy_type(np.float32))
             elif type_context == "d":
-                return Literal(repr(float(expr)))
+                return TypedLiteral(repr(float(expr)), to_loopy_type(np.float64))
             elif type_context in ["i", "b"]:
                 return int(expr)
             else:
diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py
index 34a88328c..823bf4218 100644
--- a/loopy/target/ispc.py
+++ b/loopy/target/ispc.py
@@ -44,14 +44,14 @@
     CoefficientCollector,
     CombineMapper,
     GroupHardwareAxisIndex,
-    Literal,
+    TypedLiteral,
     LocalHardwareAxisIndex,
     SubstitutionMapper,
     flatten,
 )
 from loopy.target.c import CFamilyASTBuilder, CFamilyTarget
 from loopy.target.c.codegen.expression import ExpressionToCExpressionMapper
-
+from loopy.types import to_loopy_type
 
 if TYPE_CHECKING:
     from loopy.codegen import CodeGenerationState
@@ -125,10 +125,10 @@ def map_constant(self, expr, type_context):
             raise NotImplementedError("complex numbers in ispc")
         else:
             if type_context == "f":
-                return Literal(repr(float(expr)))
+                return TypedLiteral(repr(float(expr)), to_loopy_type(np.float32))
             elif type_context == "d":
                 # Keepin' the good ideas flowin' since '66.
-                return Literal(repr(float(expr))+"d")
+                return TypedLiteral(repr(float(expr))+"d", to_loopy_type(np.float64))
             elif type_context in ["i", "b"]:
                 return expr
             else:

From a52ea24025b623e903c4d5510184d4578c3985ff Mon Sep 17 00:00:00 2001
From: Nick <koskelo2@illinois.edu>
Date: Thu, 13 Mar 2025 15:20:50 -0500
Subject: [PATCH 15/18] Give the vector type when appropriate.

---
 loopy/type_inference.py | 33 +++++++++++++++++++++++++++++----
 1 file changed, 29 insertions(+), 4 deletions(-)

diff --git a/loopy/type_inference.py b/loopy/type_inference.py
index 8894af573..36ce49460 100644
--- a/loopy/type_inference.py
+++ b/loopy/type_inference.py
@@ -47,6 +47,7 @@
     SubArrayRef,
     SubstitutionRuleExpander,
     SubstitutionRuleMappingContext,
+    TypedLiteral,
     parse_tagged_name,
 )
 from loopy.translation_unit import (
@@ -365,6 +366,9 @@ def map_quotient(self, expr):
         else:
             return self.combine([n_dtype_set, d_dtype_set])
 
+    def map_typed_literal(self, expr: TypedLiteral):
+        return [expr.dtype]
+
     def map_constant(self, expr):
         if isinstance(expr, np.generic):
             return [NumpyType(np.dtype(type(expr)))]
@@ -540,19 +544,40 @@ def map_lookup(self, expr):
         dtype = field[0]
         return [NumpyType(dtype)]
 
+    def is_vector_dtype(self, dtype):
+        target = self.kernel.target
+
+        return target.is_vector_dtype(dtype)
+
     def map_comparison(self, expr):
-        self(expr.left, return_tuple=False, return_dtype_set=False)
-        self(expr.right, return_tuple=False, return_dtype_set=False)
+        left = self(expr.left, return_tuple=False, return_dtype_set=False)
+        right = self(expr.right, return_tuple=False, return_dtype_set=False)
+        # We need to return a vector type if either of the sides is a vector.
+
+        vector_output = []
+        for dtype in (left, right):
+            if self.is_vector_dtype(dtype):
+                vector_output.append(dtype)
+        if vector_output:
+            return vector_output
         return [NumpyType(np.dtype(np.bool_))]
 
     def map_logical_not(self, expr):
-        self.rec(expr.child)
+        child = self.rec(expr.child)
+        if self.is_vector_dtype(child):
+            return child
 
         return [NumpyType(np.dtype(np.bool_))]
 
     def map_logical_and(self, expr):
+        output_type = []
         for child in expr.children:
-            self.rec(child)
+            type_to_check = self.rec(child)
+            if self.is_vector_dtype(type_to_check):
+                output_type.append(type_to_check)
+
+        if output_type:
+            return output_type
 
         return [NumpyType(np.dtype(np.bool_))]
 

From 420d16fe4ec055feb539b1ccd661cf9af5ae5d30 Mon Sep 17 00:00:00 2001
From: Nick <koskelo2@illinois.edu>
Date: Thu, 13 Mar 2025 15:21:54 -0500
Subject: [PATCH 16/18] Ruff ispc.

---
 loopy/target/ispc.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py
index 823bf4218..bcee0e905 100644
--- a/loopy/target/ispc.py
+++ b/loopy/target/ispc.py
@@ -44,15 +44,16 @@
     CoefficientCollector,
     CombineMapper,
     GroupHardwareAxisIndex,
-    TypedLiteral,
     LocalHardwareAxisIndex,
     SubstitutionMapper,
+    TypedLiteral,
     flatten,
 )
 from loopy.target.c import CFamilyASTBuilder, CFamilyTarget
 from loopy.target.c.codegen.expression import ExpressionToCExpressionMapper
 from loopy.types import to_loopy_type
 
+
 if TYPE_CHECKING:
     from loopy.codegen import CodeGenerationState
     from loopy.codegen.result import CodeGenerationResult

From a624ffd39a8ad4fb17b2b8be3df93fd71572dabf Mon Sep 17 00:00:00 2001
From: Nick <koskelo2@illinois.edu>
Date: Thu, 13 Mar 2025 15:33:02 -0500
Subject: [PATCH 17/18] More ruff.

---
 test/test_target.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_target.py b/test/test_target.py
index 9231ffe78..b9d22cd87 100644
--- a/test/test_target.py
+++ b/test/test_target.py
@@ -898,7 +898,7 @@ def test_cl_vectorize_index_variable(ctx_factory):
 
     i = np.arange(16)
     j = np.arange(4)
-    ind = 4*i[:,None] + j
+    ind = 4*i[:, None] + j
     result_ref = np.where(ind < 32, a*3, np.sin(a))
 
     assert np.allclose(result, result_ref)

From 34e50f1c37a4392a15ac8a9eba7c10facf46a5e7 Mon Sep 17 00:00:00 2001
From: Nick <koskelo2@illinois.edu>
Date: Thu, 13 Mar 2025 16:58:28 -0500
Subject: [PATCH 18/18] Add the for tag to i_outer.

---
 test/test_target.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_target.py b/test/test_target.py
index b9d22cd87..76a38518d 100644
--- a/test/test_target.py
+++ b/test/test_target.py
@@ -884,7 +884,7 @@ def test_cl_vectorize_index_variable(ctx_factory):
 
     knl = lp.split_array_axis(knl, "a,b", 0, 4)
     knl = lp.split_iname(knl, "i", 4)
-    knl = lp.tag_inames(knl, {"i_inner": "vec"})
+    knl = lp.tag_inames(knl, {"i_inner": "vec", "i_outer": "for"})
     knl = lp.tag_array_axes(knl, "a,b", "c,vec")
     knl = lp.set_options(knl, write_code=True)
     knl = lp.assume(knl, "n % 4 = 0 and n>0")