Project Ne10
An Open Optimized Software Library Project for the ARM Architecture
NE10_divc.neon.c
1 /*
2  * Copyright 2011-15 ARM Limited and Contributors.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of ARM Limited nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY ARM LIMITED AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL ARM LIMITED AND CONTRIBUTORS BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 /*
29  * NE10 Library : math/NE10_divc.neon.c
30  */
31 
32 #include "NE10_types.h"
33 #include "macros.h"
34 
35 #include <assert.h>
36 #include <arm_neon.h>
37 
38 #include <stdio.h>
39 #include <stdlib.h>
40 
41 ne10_result_t ne10_divc_float_neon (ne10_float32_t * dst, ne10_float32_t * src, const ne10_float32_t cst, ne10_uint32_t count)
42 {
43  NE10_XC_OPERATION_FLOAT_NEON
44  (
45  /* a single division operation */
46  float32x4_t rec = vrecpeq_f32 (n_cst);
47  rec = vmulq_f32 (vrecpsq_f32 (n_cst, rec), rec);
48  rec = vmulq_f32 (vrecpsq_f32 (n_cst, rec), rec);
49  n_dst = vmulq_f32 (n_src , rec);
50  ,
51  /* a single division operation */
52  float32x2_t rec = vrecpe_f32 (n_tmp_cst);
53  rec = vmul_f32 (vrecps_f32 (n_tmp_cst, rec), rec);
54  rec = vmul_f32 (vrecps_f32 (n_tmp_cst, rec), rec);
55  n_tmp_src = vmul_f32 (n_tmp_src, rec);
56  );
57 }
58 
59 ne10_result_t ne10_divc_vec2f_neon (ne10_vec2f_t * dst, ne10_vec2f_t * src, const ne10_vec2f_t * cst, ne10_uint32_t count)
60 {
61  NE10_XC_OPERATION_VEC2F_NEON
62  (
63  /* a single division operation */
64  float32x4_t rec = vrecpeq_f32 (n_cst);
65  rec = vmulq_f32 (vrecpsq_f32 (n_cst, rec), rec);
66  rec = vmulq_f32 (vrecpsq_f32 (n_cst, rec), rec);
67  n_dst = vmulq_f32 (n_src , rec);
68  ,
69  /* a single division operation */
70  float32x2_t rec = vrecpe_f32 (n_tmp_cst);
71  rec = vmul_f32 (vrecps_f32 (n_tmp_cst, rec), rec);
72  rec = vmul_f32 (vrecps_f32 (n_tmp_cst, rec), rec);
73  n_tmp_src = vmul_f32 (n_tmp_src, rec);
74  );
75 }
76 
77 ne10_result_t ne10_divc_vec3f_neon (ne10_vec3f_t * dst, ne10_vec3f_t * src, const ne10_vec3f_t * cst, ne10_uint32_t count)
78 {
79  NE10_XC_OPERATION_VEC3F_NEON
80  (
81  /* three division operations */
82  float32x4_t rec = vrecpeq_f32 (n_cst1);
83  rec = vmulq_f32 (vrecpsq_f32 (n_cst1, rec), rec);
84  rec = vmulq_f32 (vrecpsq_f32 (n_cst1, rec), rec);
85  n_dst1 = vmulq_f32 (n_src1 , rec);
86 
87  rec = vrecpeq_f32 (n_cst2);
88  rec = vmulq_f32 (vrecpsq_f32 (n_cst2, rec), rec);
89  rec = vmulq_f32 (vrecpsq_f32 (n_cst2, rec), rec);
90  n_dst2 = vmulq_f32 (n_src2 , rec);
91 
92  rec = vrecpeq_f32 (n_cst3);
93  rec = vmulq_f32 (vrecpsq_f32 (n_cst3, rec), rec);
94  rec = vmulq_f32 (vrecpsq_f32 (n_cst3, rec), rec);
95  n_dst3 = vmulq_f32 (n_src3 , rec);
96  ,
97  /* three division operations */
98  float32x2_t rec = vrecpe_f32 (n_tmp_cst.val[0]);
99  rec = vmul_f32 (vrecps_f32 (n_tmp_cst.val[0], rec), rec);
100  rec = vmul_f32 (vrecps_f32 (n_tmp_cst.val[0], rec), rec);
101  n_tmp_src.val[0] = vmul_f32 (n_tmp_src.val[0] , rec);
102 
103  rec = vrecpe_f32 (n_tmp_cst.val[1]);
104  rec = vmul_f32 (vrecps_f32 (n_tmp_cst.val[1], rec), rec);
105  rec = vmul_f32 (vrecps_f32 (n_tmp_cst.val[1], rec), rec);
106  n_tmp_src.val[1] = vmul_f32 (n_tmp_src.val[1] , rec);
107 
108  rec = vrecpe_f32 (n_tmp_cst.val[2]);
109  rec = vmul_f32 (vrecps_f32 (n_tmp_cst.val[2], rec), rec);
110  rec = vmul_f32 (vrecps_f32 (n_tmp_cst.val[2], rec), rec);
111  n_tmp_src.val[2] = vmul_f32 (n_tmp_src.val[2] , rec);
112  );
113 }
114 
115 ne10_result_t ne10_divc_vec4f_neon (ne10_vec4f_t * dst, ne10_vec4f_t * src, const ne10_vec4f_t * cst, ne10_uint32_t count)
116 {
117  NE10_XC_OPERATION_VEC4F_NEON
118  (
119  /* a single division operation */
120  float32x4_t rec = vrecpeq_f32 (n_cst);
121  rec = vmulq_f32 (vrecpsq_f32 (n_cst, rec), rec);
122  rec = vmulq_f32 (vrecpsq_f32 (n_cst, rec), rec);
123  n_dst = vmulq_f32 (n_src , rec);
124  );
125 }
ne10_vec2f_t: a 2-tuple of ne10_float32_t values.
Definition: NE10_types.h:87
ne10_vec3f_t: a 3-tuple of ne10_float32_t values.
Definition: NE10_types.h:96
ne10_vec4f_t: a 4-tuple of ne10_float32_t values.
Definition: NE10_types.h:106