pmonday · 8f1c88a8
--- a/simd-vector-example-in-llvm.md
+++ b/simd-vector-example-in-llvm.md
@@ -218,7 +218,7 @@ Let's take a look at a program with a substantially larger array of floats to ad

 int main()
 {
-   int sz = 40000;
+   int sz = 128;
   float x[sz], y[sz], z[sz];
   int i;
   for (i = 0; i < sz; i++) {
@@ -244,94 +244,100 @@ In this case, the LLVM code is more compact and easier to work with if we use th
 The resulting optimized LLVM code is as follows:

 ```wiki
-; ModuleID = '/tmp/webcompile/_2358_0.bc'
+; ModuleID = '/tmp/webcompile/_15374_0.bc'
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 target triple = "x86_64-unknown-linux-gnu"

-@.str = private constant [13 x i8] c"%f %f %f %f\0A\00"
+@.str2 = private unnamed_addr constant [13 x i8] c"%f %f %f %f\0A\00"
+@str = internal constant [24 x i8] c"Entering initialization\00"
+@str3 = internal constant [18 x i8] c"Entering addition\00"

 define i32 @main() nounwind {
 ; <label>:0
  %1 = alloca [40000 x float], align 16
  %2 = alloca [40000 x float], align 16
  %3 = alloca [40000 x float], align 16
+  %puts = call i32 @puts(i8* getelementptr inbounds ([24 x i8]* @str, i64 0, i64 0))
  br label %4

-.preheader:                                       ; preds = %4
+; <label>:4                                       ; preds = %4, %0
+  %indvar21 = phi i64 [ 0, %0 ], [ %indvar.next22, %4 ]
+  %i.06 = trunc i64 %indvar21 to i32
+  %scevgep25 = getelementptr [40000 x float]* %2, i64 0, i64 %indvar21
+  %scevgep26 = getelementptr [40000 x float]* %1, i64 0, i64 %indvar21
+  %tmp27 = add i64 %indvar21, 40000
+  %tmp28 = trunc i64 %tmp27 to i32
+  %5 = sitofp i32 %i.06 to float
+  store float %5, float* %scevgep26, align 4, !tbaa !0
+  %6 = sitofp i32 %tmp28 to float
+  store float %6, float* %scevgep25, align 4, !tbaa !0
+  %indvar.next22 = add i64 %indvar21, 1
+  %exitcond23 = icmp eq i64 %indvar.next22, 40000
+  br i1 %exitcond23, label %7, label %4
+
+; <label>:7                                       ; preds = %4
  %.sub3 = getelementptr inbounds [40000 x float]* %3, i64 0, i64 0
-  br label %7
+  %puts4 = call i32 @puts(i8* getelementptr inbounds ([18 x i8]* @str3, i64 0, i64 0))
+  br label %8

-; <label>:4                                       ; preds = %4, %0
-  %indvar20 = phi i64 [ 0, %0 ], [ %indvar.next21, %4 ]
-  %i.05 = trunc i64 %indvar20 to i32
-  %scevgep24 = getelementptr [40000 x float]* %2, i64 0, i64 %indvar20
-  %scevgep25 = getelementptr [40000 x float]* %1, i64 0, i64 %indvar20
-  %tmp26 = add i64 %indvar20, 40000
-  %tmp27 = trunc i64 %tmp26 to i32
-  %5 = sitofp i32 %i.05 to float
-  store float %5, float* %scevgep25, align 4, !tbaa !0
-  %6 = sitofp i32 %tmp27 to float
-  store float %6, float* %scevgep24, align 4, !tbaa !0
-  %indvar.next21 = add i64 %indvar20, 1
-  %exitcond22 = icmp eq i64 %indvar.next21, 40000
-  br i1 %exitcond22, label %.preheader, label %4
-
-; <label>:7                                       ; preds = %7, %.preheader
-  %indvar = phi i64 [ 0, %.preheader ], [ %indvar.next, %7 ]
+; <label>:8                                       ; preds = %8, %7
+  %indvar = phi i64 [ 0, %7 ], [ %indvar.next, %8 ]
  %tmp = shl i64 %indvar, 2
  %scevgep = getelementptr [40000 x float]* %3, i64 0, i64 %tmp
-  %scevgep6 = getelementptr [40000 x float]* %2, i64 0, i64 %tmp
-  %scevgep7 = getelementptr [40000 x float]* %1, i64 0, i64 %tmp
-  %tmp828 = or i64 %tmp, 1
-  %scevgep9 = getelementptr [40000 x float]* %3, i64 0, i64 %tmp828
-  %scevgep10 = getelementptr [40000 x float]* %2, i64 0, i64 %tmp828
-  %scevgep11 = getelementptr [40000 x float]* %1, i64 0, i64 %tmp828
-  %tmp1229 = or i64 %tmp, 2
-  %scevgep13 = getelementptr [40000 x float]* %3, i64 0, i64 %tmp1229
-  %scevgep14 = getelementptr [40000 x float]* %2, i64 0, i64 %tmp1229
-  %scevgep15 = getelementptr [40000 x float]* %1, i64 0, i64 %tmp1229
-  %tmp1630 = or i64 %tmp, 3
-  %scevgep17 = getelementptr [40000 x float]* %3, i64 0, i64 %tmp1630
-  %scevgep18 = getelementptr [40000 x float]* %2, i64 0, i64 %tmp1630
-  %scevgep19 = getelementptr [40000 x float]* %1, i64 0, i64 %tmp1630
-  %8 = load float* %scevgep7, align 16, !tbaa !0
-  %9 = load float* %scevgep6, align 16, !tbaa !0
-  %10 = fadd float %8, %9
-  store float %10, float* %scevgep, align 16, !tbaa !0
-  %11 = load float* %scevgep11, align 4, !tbaa !0
-  %12 = load float* %scevgep10, align 4, !tbaa !0
-  %13 = fadd float %11, %12
-  store float %13, float* %scevgep9, align 4, !tbaa !0
-  %14 = load float* %scevgep15, align 8, !tbaa !0
-  %15 = load float* %scevgep14, align 8, !tbaa !0
-  %16 = fadd float %14, %15
-  store float %16, float* %scevgep13, align 8, !tbaa !0
-  %17 = load float* %scevgep19, align 4, !tbaa !0
-  %18 = load float* %scevgep18, align 4, !tbaa !0
-  %19 = fadd float %17, %18
-  store float %19, float* %scevgep17, align 4, !tbaa !0
+  %scevgep7 = getelementptr [40000 x float]* %2, i64 0, i64 %tmp
+  %scevgep8 = getelementptr [40000 x float]* %1, i64 0, i64 %tmp
+  %tmp929 = or i64 %tmp, 1
+  %scevgep10 = getelementptr [40000 x float]* %3, i64 0, i64 %tmp929
+  %scevgep11 = getelementptr [40000 x float]* %2, i64 0, i64 %tmp929
+  %scevgep12 = getelementptr [40000 x float]* %1, i64 0, i64 %tmp929
+  %tmp1330 = or i64 %tmp, 2
+  %scevgep14 = getelementptr [40000 x float]* %3, i64 0, i64 %tmp1330
+  %scevgep15 = getelementptr [40000 x float]* %2, i64 0, i64 %tmp1330
+  %scevgep16 = getelementptr [40000 x float]* %1, i64 0, i64 %tmp1330
+  %tmp1731 = or i64 %tmp, 3
+  %scevgep18 = getelementptr [40000 x float]* %3, i64 0, i64 %tmp1731
+  %scevgep19 = getelementptr [40000 x float]* %2, i64 0, i64 %tmp1731
+  %scevgep20 = getelementptr [40000 x float]* %1, i64 0, i64 %tmp1731
+  %9 = load float* %scevgep8, align 16, !tbaa !0
+  %10 = load float* %scevgep7, align 16, !tbaa !0
+  %11 = fadd float %9, %10
+  store float %11, float* %scevgep, align 16, !tbaa !0
+  %12 = load float* %scevgep12, align 4, !tbaa !0
+  %13 = load float* %scevgep11, align 4, !tbaa !0
+  %14 = fadd float %12, %13
+  store float %14, float* %scevgep10, align 4, !tbaa !0
+  %15 = load float* %scevgep16, align 8, !tbaa !0
+  %16 = load float* %scevgep15, align 8, !tbaa !0
+  %17 = fadd float %15, %16
+  store float %17, float* %scevgep14, align 8, !tbaa !0
+  %18 = load float* %scevgep20, align 4, !tbaa !0
+  %19 = load float* %scevgep19, align 4, !tbaa !0
+  %20 = fadd float %18, %19
+  store float %20, float* %scevgep18, align 4, !tbaa !0
  %indvar.next = add i64 %indvar, 1
  %exitcond = icmp eq i64 %indvar.next, 10000
-  br i1 %exitcond, label %20, label %7
-
-; <label>:20                                      ; preds = %7
-  %21 = load float* %.sub3, align 16, !tbaa !0
-  %22 = fpext float %21 to double
-  %23 = getelementptr inbounds [40000 x float]* %3, i64 0, i64 1
-  %24 = load float* %23, align 4, !tbaa !0
-  %25 = fpext float %24 to double
-  %26 = getelementptr inbounds [40000 x float]* %3, i64 0, i64 2
-  %27 = load float* %26, align 8, !tbaa !0
-  %28 = fpext float %27 to double
-  %29 = getelementptr inbounds [40000 x float]* %3, i64 0, i64 3
-  %30 = load float* %29, align 4, !tbaa !0
-  %31 = fpext float %30 to double
-  %32 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([13 x i8]* @.str, i64 0, i64 0), double %22, double %25, double %28, double %31) nounwind
+  br i1 %exitcond, label %21, label %8
+
+; <label>:21                                      ; preds = %8
+  %22 = load float* %.sub3, align 16, !tbaa !0
+  %23 = fpext float %22 to double
+  %24 = getelementptr inbounds [40000 x float]* %3, i64 0, i64 1
+  %25 = load float* %24, align 4, !tbaa !0
+  %26 = fpext float %25 to double
+  %27 = getelementptr inbounds [40000 x float]* %3, i64 0, i64 2
+  %28 = load float* %27, align 8, !tbaa !0
+  %29 = fpext float %28 to double
+  %30 = getelementptr inbounds [40000 x float]* %3, i64 0, i64 3
+  %31 = load float* %30, align 4, !tbaa !0
+  %32 = fpext float %31 to double
+  %33 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([13 x i8]* @.str2, i64 0, i64 0), double %23, double %26, double %29, double %32) nounwind
  ret i32 0
 }

 declare i32 @printf(i8* nocapture, ...) nounwind

+declare i32 @puts(i8* nocapture) nounwind
+
 !0 = metadata !{metadata !"float", metadata !1}
 !1 = metadata !{metadata !"omnipotent char", metadata !2}
 !2 = metadata !{metadata !"Simple C/C++ TBAA", null}
@@ -341,6 +347,79 @@ declare i32 @printf(i8* nocapture, ...) nounwind
 Instead of hand-optimizing the entire sequence, the exercise will merely convert the types to vectors and then alter the loop starting with "label 44" to use vector addition rather than the sequence of adds that is currently being used.  The resulting program is as follows:

 ```wiki
+; ModuleID = '/tmp/webcompile/_15374_0.bc'
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-unknown-linux-gnu"
+
+@.str2 = private unnamed_addr constant [13 x i8] c"%f %f %f %f\0A\00"
+@str = internal constant [24 x i8] c"Entering initialization\00"
+@str3 = internal constant [18 x i8] c"Entering addition\00"
+
+define i32 @main() nounwind {
+; <label>:0
+  %1 = alloca <128 x float>, align 16
+  %2 = alloca <128 x float>, align 16
+  %3 = alloca <128 x float>, align 16
+  %puts = call i32 @puts(i8* getelementptr inbounds ([24 x i8]* @str, i64 0, i64 0))
+  br label %4
+
+; <label>:4                                       ; preds = %4, %0
+  %indvar21 = phi i64 [ 0, %0 ], [ %indvar.next22, %4 ]
+  %i.06 = trunc i64 %indvar21 to i32
+  %scevgep25 = getelementptr <128 x float>* %2, i64 0, i64 %indvar21
+  %scevgep26 = getelementptr <128 x float>* %1, i64 0, i64 %indvar21
+  %tmp27 = add i64 %indvar21, 128
+  %tmp28 = trunc i64 %tmp27 to i32
+  %5 = sitofp i32 %i.06 to float
+  store float %5, float* %scevgep26, align 4, !tbaa !0
+  %6 = sitofp i32 %tmp28 to float
+  store float %6, float* %scevgep25, align 4, !tbaa !0
+  %indvar.next22 = add i64 %indvar21, 1
+  %exitcond23 = icmp eq i64 %indvar.next22, 128
+  br i1 %exitcond23, label %7, label %4
+
+; <label>:7                                       ; preds = %4
+  %.sub3 = getelementptr inbounds <128 x float>* %3, i64 0, i64 0
+  %puts4 = call i32 @puts(i8* getelementptr inbounds ([18 x i8]* @str3, i64 0, i64 0))
+  br label %8
+
+; <label>:8                                       ; preds = %8, %7
+;  %indvar = phi i64 [ 0, %7 ], [ %indvar.next, %8 ]
+
+
+;  %indvar.next = add i64 %indvar, 1
+;  %exitcond = icmp eq i64 %indvar.next, 10000
+;  br i1 %exitcond, label %9, label %8
+   br label %9
+
+; <label>:9                                      ; preds = %8
+  %xs = load <128 x float>* %1
+  %ys = load <128 x float>* %2
+  %zs = fadd <128 x float> %xs, %ys
+  store <128 x float> %zs, <128 x float>* %3
+
+  %10 = load float* %.sub3, align 16, !tbaa !0
+  %11 = fpext float %10 to double
+  %12 = getelementptr inbounds <128 x float>* %3, i64 0, i64 1
+  %13 = load float* %12, align 4, !tbaa !0
+  %14 = fpext float %13 to double
+  %15 = getelementptr inbounds <128 x float>* %3, i64 0, i64 2
+  %16 = load float* %15, align 8, !tbaa !0
+  %17 = fpext float %16 to double
+  %18 = getelementptr inbounds <128 x float>* %3, i64 0, i64 3
+  %19 = load float* %18, align 4, !tbaa !0
+  %20 = fpext float %19 to double
+  %21 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([13 x i8]* @.str2, i64 0, i64 0), double %11, double %14, double %17, double %20) nounwind
+  ret i32 0
+}
+
+declare i32 @printf(i8* nocapture, ...) nounwind
+
+declare i32 @puts(i8* nocapture) nounwind
+
+!0 = metadata !{metadata !"float", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA", null}
 ```


@@ -349,7 +428,9 @@ Timing the execution of the optimized vs. non-optimized bytecodes yields:

 Finally, a note on converting from arrays to vectors and subsequently optimizing to use vector adds.  The simplest way to do this was to:

- convert the code to multiply 4 of the array values at a time
+- convert the code to add 4 of the array values at a time (the right count depends on the register sizes, data sizes, etc.)

-  - convert the array types to vector (\[4000 x float\] becomes \<4000 x float\>), the program will work AS-IS with this simple conversion
+  - convert the array types to vectors (\[128 x float\] becomes \<128 x float\>); the program will work AS-IS with this simple conversion
  - work through the loop again to move to a load of the proper location in the vector to a packed vector, then do the fadd of the vectors
+  - vector sizes MUST be a power of 2 (1, 2, 4, 8, 16, ...)
+  - vector sizes seem to be limited: 32768 definitely did NOT work, while 128 does work