[v1,3/3] x86: Optimize memchr-evex.S

Message ID 20210503084435.160548-3-goldstein.w.n@gmail.com
State New
Headers show
Series
  • [v1,1/3] Bench: Expand bench-memchr.c
Related show

Commit Message

Adhemerval Zanella via Libc-alpha May 3, 2021, 8:44 a.m.
No bug. This commit optimizes memchr-evex.S. The optimizations include
replacing some branches with cmovcc, avoiding some branches entirely
in the less_4x_vec case, making the page cross logic less strict,
saving some ALU in the alignment process, and most importantly
increasing ILP in the 4x loop. test-memchr, test-rawmemchr, and
test-wmemchr are all passing.

Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>

---
Tests were run on the following CPUs:

Tigerlake: https://ark.intel.com/content/www/us/en/ark/products/208921/intel-core-i7-1165g7-processor-12m-cache-up-to-4-70-ghz-with-ipu.html

Icelake: https://ark.intel.com/content/www/us/en/ark/products/196597/intel-core-i7-1065g7-processor-8m-cache-up-to-3-90-ghz.html

Skylake: https://ark.intel.com/content/www/us/en/ark/products/149091/intel-core-i7-8565u-processor-8m-cache-up-to-4-60-ghz.html

All times are the geometric mean of N=20. The unit of time is
seconds.

"Cur" refers to the current implementation
"New" refers to this patch's implementation

Note: The numbers for size = [1, 32] are highly dependent on function
alignment. That being said, the new implementation, which uses cmovcc
instead of a branch for the [1, 32] case (mostly because of the high
variance a branch shows across different alignments), is far more
consistent and performs about as well (and should only improve further
in cases where the size / position are not 100% predictable).

For memchr-evex the numbers are a near universal improvement. The case
where the current implementation is better is size = 0; for size =
[1, 32] with pos < size the two implementations are about the
same. For size = [1, 32] with pos > size, for medium range sizes, and
for large sizes, however, the new implementation is faster.

Results For Tigerlake memchr-evex
size  , algn  , Pos   , Cur T , New T , Win   , Dif   
2048  , 0     , , 32    5.58  , 5.22  , New   , 0.36  
256   , 1     , , 64    5.22  , 4.93  , New   , 0.29  
2048  , 0     , , 64    5.22  , 4.89  , New   , 0.33  
256   , 2     , , 64    5.14  , 4.81  , New   , 0.33  
2048  , 0     , , 128   6.3   , 5.67  , New   , 0.63  
256   , 3     , , 64    5.22  , 4.9   , New   , 0.32  
2048  , 0     , , 256   11.07 , 10.92 , New   , 0.15  
256   , 4     , , 64    5.16  , 4.86  , New   , 0.3   
2048  , 0     , , 512   15.66 , 14.81 , New   , 0.85  
256   , 5     , , 64    5.15  , 4.84  , New   , 0.31  
2048  , 0     , , 1024  25.7  , 23.02 , New   , 2.68  
256   , 6     , , 64    5.12  , 4.89  , New   , 0.23  
2048  , 0     , , 2048  42.34 , 37.71 , New   , 4.63  
256   , 7     , , 64    5.03  , 4.62  , New   , 0.41  
192   , 1     , , 32    4.96  , 4.28  , New   , 0.68  
256   , 1     , , 32    4.95  , 4.28  , New   , 0.67  
512   , 1     , , 32    4.94  , 4.29  , New   , 0.65  
192   , 2     , , 64    5.1   , 4.8   , New   , 0.3   
512   , 2     , , 64    5.12  , 4.72  , New   , 0.4   
192   , 3     , , 96    5.54  , 5.12  , New   , 0.42  
256   , 3     , , 96    5.52  , 5.15  , New   , 0.37  
512   , 3     , , 96    5.51  , 5.16  , New   , 0.35  
192   , 4     , , 128   6.1   , 5.53  , New   , 0.57  
256   , 4     , , 128   6.09  , 5.49  , New   , 0.6   
512   , 4     , , 128   6.08  , 5.48  , New   , 0.6   
192   , 5     , , 160   7.42  , 6.71  , New   , 0.71  
256   , 5     , , 160   6.86  , 6.71  , New   , 0.15  
512   , 5     , , 160   9.28  , 8.68  , New   , 0.6   
192   , 6     , , 192   7.94  , 7.47  , New   , 0.47  
256   , 6     , , 192   7.62  , 7.17  , New   , 0.45  
512   , 6     , , 192   9.2   , 9.16  , New   , 0.04  
192   , 7     , , 224   8.02  , 7.43  , New   , 0.59  
256   , 7     , , 224   8.34  , 7.85  , New   , 0.49  
512   , 7     , , 224   9.89  , 9.16  , New   , 0.73  
2     , 0     , , 1     3.0   , 3.0   , Eq    , 0.0
2     , 1     , , 1     3.0   , 3.0   , Eq    , 0.0
0     , 0     , , 1     3.01  , 3.6   , Cur   , 0.59  
0     , 1     , , 1     3.01  , 3.6   , Cur   , 0.59  
3     , 0     , , 2     3.0   , 3.0   , Eq    , 0.0
3     , 2     , , 2     3.0   , 3.0   , Eq    , 0.0
1     , 0     , , 2     3.6   , 3.0   , New   , 0.6   
1     , 2     , , 2     3.6   , 3.0   , New   , 0.6   
4     , 0     , , 3     3.01  , 3.01  , Eq    , 0.0
4     , 3     , , 3     3.01  , 3.01  , Eq    , 0.0
2     , 0     , , 3     3.62  , 3.02  , New   , 0.6   
2     , 3     , , 3     3.62  , 3.03  , New   , 0.59  
5     , 0     , , 4     3.02  , 3.03  , Cur   , 0.01  
5     , 4     , , 4     3.02  , 3.02  , Eq    , 0.0
3     , 0     , , 4     3.63  , 3.02  , New   , 0.61  
3     , 4     , , 4     3.63  , 3.04  , New   , 0.59  
6     , 0     , , 5     3.05  , 3.04  , New   , 0.01  
6     , 5     , , 5     3.02  , 3.02  , Eq    , 0.0
4     , 0     , , 5     3.63  , 3.02  , New   , 0.61  
4     , 5     , , 5     3.64  , 3.03  , New   , 0.61  
7     , 0     , , 6     3.03  , 3.03  , Eq    , 0.0
7     , 6     , , 6     3.02  , 3.02  , Eq    , 0.0
5     , 0     , , 6     3.64  , 3.01  , New   , 0.63  
5     , 6     , , 6     3.64  , 3.03  , New   , 0.61  
8     , 0     , , 7     3.03  , 3.04  , Cur   , 0.01  
8     , 7     , , 7     3.04  , 3.04  , Eq    , 0.0
6     , 0     , , 7     3.67  , 3.04  , New   , 0.63  
6     , 7     , , 7     3.65  , 3.05  , New   , 0.6   
9     , 0     , , 8     3.05  , 3.05  , Eq    , 0.0
7     , 0     , , 8     3.67  , 3.05  , New   , 0.62  
10    , 0     , , 9     3.06  , 3.06  , Eq    , 0.0
10    , 1     , , 9     3.06  , 3.06  , Eq    , 0.0
8     , 0     , , 9     3.67  , 3.06  , New   , 0.61  
8     , 1     , , 9     3.67  , 3.06  , New   , 0.61  
11    , 0     , , 10    3.06  , 3.06  , Eq    , 0.0
11    , 2     , , 10    3.07  , 3.06  , New   , 0.01  
9     , 0     , , 10    3.67  , 3.05  , New   , 0.62  
9     , 2     , , 10    3.67  , 3.06  , New   , 0.61  
12    , 0     , , 11    3.06  , 3.06  , Eq    , 0.0
12    , 3     , , 11    3.06  , 3.06  , Eq    , 0.0
10    , 0     , , 11    3.67  , 3.06  , New   , 0.61  
10    , 3     , , 11    3.67  , 3.06  , New   , 0.61  
13    , 0     , , 12    3.06  , 3.07  , Cur   , 0.01  
13    , 4     , , 12    3.06  , 3.07  , Cur   , 0.01  
11    , 0     , , 12    3.67  , 3.11  , New   , 0.56  
11    , 4     , , 12    3.68  , 3.12  , New   , 0.56  
14    , 0     , , 13    3.07  , 3.1   , Cur   , 0.03  
14    , 5     , , 13    3.06  , 3.07  , Cur   , 0.01  
12    , 0     , , 13    3.67  , 3.07  , New   , 0.6   
12    , 5     , , 13    3.67  , 3.08  , New   , 0.59  
15    , 0     , , 14    3.06  , 3.06  , Eq    , 0.0
15    , 6     , , 14    3.07  , 3.06  , New   , 0.01  
13    , 0     , , 14    3.67  , 3.06  , New   , 0.61  
13    , 6     , , 14    3.68  , 3.06  , New   , 0.62  
16    , 0     , , 15    3.06  , 3.06  , Eq    , 0.0
16    , 7     , , 15    3.06  , 3.05  , New   , 0.01  
14    , 0     , , 15    3.68  , 3.06  , New   , 0.62  
14    , 7     , , 15    3.67  , 3.06  , New   , 0.61  
17    , 0     , , 16    3.07  , 3.06  , New   , 0.01  
15    , 0     , , 16    3.68  , 3.06  , New   , 0.62  
18    , 0     , , 17    3.06  , 3.06  , Eq    , 0.0
18    , 1     , , 17    3.06  , 3.06  , Eq    , 0.0
16    , 0     , , 17    3.67  , 3.06  , New   , 0.61  
16    , 1     , , 17    3.67  , 3.05  , New   , 0.62  
19    , 0     , , 18    3.07  , 3.06  , New   , 0.01  
19    , 2     , , 18    3.06  , 3.06  , Eq    , 0.0
17    , 0     , , 18    3.68  , 3.08  , New   , 0.6   
17    , 2     , , 18    3.68  , 3.06  , New   , 0.62  
20    , 0     , , 19    3.06  , 3.06  , Eq    , 0.0
20    , 3     , , 19    3.06  , 3.06  , Eq    , 0.0
18    , 0     , , 19    3.68  , 3.06  , New   , 0.62  
18    , 3     , , 19    3.68  , 3.06  , New   , 0.62  
21    , 0     , , 20    3.06  , 3.06  , Eq    , 0.0
21    , 4     , , 20    3.06  , 3.06  , Eq    , 0.0
19    , 0     , , 20    3.67  , 3.06  , New   , 0.61  
19    , 4     , , 20    3.67  , 3.06  , New   , 0.61  
22    , 0     , , 21    3.06  , 3.06  , Eq    , 0.0
22    , 5     , , 21    3.06  , 3.06  , Eq    , 0.0
20    , 0     , , 21    3.67  , 3.05  , New   , 0.62  
20    , 5     , , 21    3.68  , 3.06  , New   , 0.62  
23    , 0     , , 22    3.07  , 3.06  , New   , 0.01  
23    , 6     , , 22    3.06  , 3.06  , Eq    , 0.0
21    , 0     , , 22    3.68  , 3.07  , New   , 0.61  
21    , 6     , , 22    3.67  , 3.06  , New   , 0.61  
24    , 0     , , 23    3.19  , 3.06  , New   , 0.13  
24    , 7     , , 23    3.08  , 3.06  , New   , 0.02  
22    , 0     , , 23    3.69  , 3.06  , New   , 0.63  
22    , 7     , , 23    3.68  , 3.06  , New   , 0.62  
25    , 0     , , 24    3.07  , 3.06  , New   , 0.01  
23    , 0     , , 24    3.68  , 3.06  , New   , 0.62  
26    , 0     , , 25    3.06  , 3.05  , New   , 0.01  
26    , 1     , , 25    3.07  , 3.06  , New   , 0.01  
24    , 0     , , 25    3.67  , 3.05  , New   , 0.62  
24    , 1     , , 25    3.68  , 3.06  , New   , 0.62  
27    , 0     , , 26    3.12  , 3.06  , New   , 0.06  
27    , 2     , , 26    3.08  , 3.06  , New   , 0.02  
25    , 0     , , 26    3.69  , 3.06  , New   , 0.63  
25    , 2     , , 26    3.67  , 3.06  , New   , 0.61  
28    , 0     , , 27    3.06  , 3.06  , Eq    , 0.0
28    , 3     , , 27    3.06  , 3.06  , Eq    , 0.0
26    , 0     , , 27    3.67  , 3.06  , New   , 0.61  
26    , 3     , , 27    3.67  , 3.06  , New   , 0.61  
29    , 0     , , 28    3.06  , 3.06  , Eq    , 0.0
29    , 4     , , 28    3.06  , 3.06  , Eq    , 0.0
27    , 0     , , 28    3.68  , 3.05  , New   , 0.63  
27    , 4     , , 28    3.67  , 3.06  , New   , 0.61  
30    , 0     , , 29    3.06  , 3.06  , Eq    , 0.0
30    , 5     , , 29    3.06  , 3.06  , Eq    , 0.0
28    , 0     , , 29    3.67  , 3.06  , New   , 0.61  
28    , 5     , , 29    3.68  , 3.06  , New   , 0.62  
31    , 0     , , 30    3.06  , 3.06  , Eq    , 0.0
31    , 6     , , 30    3.06  , 3.06  , Eq    , 0.0
29    , 0     , , 30    3.68  , 3.06  , New   , 0.62  
29    , 6     , , 30    3.7   , 3.06  , New   , 0.64  
32    , 0     , , 31    3.17  , 3.06  , New   , 0.11  
32    , 7     , , 31    3.12  , 3.06  , New   , 0.06  
30    , 0     , , 31    3.68  , 3.06  , New   , 0.62  
30    , 7     , , 31    3.68  , 3.06  , New   , 0.62

Results For Icelake memchr-evex
size  , algn  , Pos   , Cur T , New T , Win   , Dif   
2048  , 0     , , 32    4.94  , 4.26  , New   , 0.68  
256   , 1     , , 64    4.5   , 4.13  , New   , 0.37  
2048  , 0     , , 64    4.19  , 3.9   , New   , 0.29  
256   , 2     , , 64    4.19  , 3.87  , New   , 0.32  
2048  , 0     , , 128   4.96  , 4.53  , New   , 0.43  
256   , 3     , , 64    4.07  , 3.86  , New   , 0.21  
2048  , 0     , , 256   8.77  , 8.61  , New   , 0.16  
256   , 4     , , 64    4.08  , 3.87  , New   , 0.21  
2048  , 0     , , 512   12.22 , 11.67 , New   , 0.55  
256   , 5     , , 64    4.12  , 3.83  , New   , 0.29  
2048  , 0     , , 1024  20.06 , 18.09 , New   , 1.97  
256   , 6     , , 64    4.2   , 3.95  , New   , 0.25  
2048  , 0     , , 2048  33.83 , 30.62 , New   , 3.21  
256   , 7     , , 64    4.3   , 4.04  , New   , 0.26  
192   , 1     , , 32    4.2   , 3.71  , New   , 0.49  
256   , 1     , , 32    4.24  , 3.76  , New   , 0.48  
512   , 1     , , 32    4.29  , 3.74  , New   , 0.55  
192   , 2     , , 64    4.42  , 4.0   , New   , 0.42  
512   , 2     , , 64    4.17  , 3.83  , New   , 0.34  
192   , 3     , , 96    4.44  , 4.26  , New   , 0.18  
256   , 3     , , 96    4.45  , 4.14  , New   , 0.31  
512   , 3     , , 96    4.42  , 4.15  , New   , 0.27  
192   , 4     , , 128   4.93  , 4.45  , New   , 0.48  
256   , 4     , , 128   4.93  , 4.47  , New   , 0.46  
512   , 4     , , 128   4.95  , 4.47  , New   , 0.48  
192   , 5     , , 160   5.95  , 5.44  , New   , 0.51  
256   , 5     , , 160   5.59  , 5.47  , New   , 0.12  
512   , 5     , , 160   7.59  , 7.34  , New   , 0.25  
192   , 6     , , 192   6.53  , 6.08  , New   , 0.45  
256   , 6     , , 192   6.2   , 5.88  , New   , 0.32  
512   , 6     , , 192   7.53  , 7.62  , Cur   , 0.09  
192   , 7     , , 224   6.62  , 6.12  , New   , 0.5   
256   , 7     , , 224   6.79  , 6.51  , New   , 0.28  
512   , 7     , , 224   8.12  , 7.61  , New   , 0.51  
2     , 0     , , 1     2.5   , 2.54  , Cur   , 0.04  
2     , 1     , , 1     2.56  , 2.55  , New   , 0.01  
0     , 0     , , 1     2.57  , 3.12  , Cur   , 0.55  
0     , 1     , , 1     2.59  , 3.14  , Cur   , 0.55  
3     , 0     , , 2     2.62  , 2.63  , Cur   , 0.01  
3     , 2     , , 2     2.66  , 2.67  , Cur   , 0.01  
1     , 0     , , 2     3.24  , 2.72  , New   , 0.52  
1     , 2     , , 2     3.28  , 2.75  , New   , 0.53  
4     , 0     , , 3     2.78  , 2.8   , Cur   , 0.02  
4     , 3     , , 3     2.8   , 2.82  , Cur   , 0.02  
2     , 0     , , 3     3.38  , 2.86  , New   , 0.52  
2     , 3     , , 3     3.41  , 2.89  , New   , 0.52  
5     , 0     , , 4     2.88  , 2.91  , Cur   , 0.03  
5     , 4     , , 4     2.88  , 2.92  , Cur   , 0.04  
3     , 0     , , 4     3.48  , 2.93  , New   , 0.55  
3     , 4     , , 4     3.47  , 2.93  , New   , 0.54  
6     , 0     , , 5     2.95  , 2.94  , New   , 0.01  
6     , 5     , , 5     2.91  , 2.92  , Cur   , 0.01  
4     , 0     , , 5     3.47  , 2.9   , New   , 0.57  
4     , 5     , , 5     3.43  , 2.91  , New   , 0.52  
7     , 0     , , 6     2.87  , 2.9   , Cur   , 0.03  
7     , 6     , , 6     2.87  , 2.89  , Cur   , 0.02  
5     , 0     , , 6     3.44  , 2.88  , New   , 0.56  
5     , 6     , , 6     3.41  , 2.87  , New   , 0.54  
8     , 0     , , 7     2.86  , 2.87  , Cur   , 0.01  
8     , 7     , , 7     2.86  , 2.87  , Cur   , 0.01  
6     , 0     , , 7     3.43  , 2.87  , New   , 0.56  
6     , 7     , , 7     3.44  , 2.87  , New   , 0.57  
9     , 0     , , 8     2.86  , 2.88  , Cur   , 0.02  
7     , 0     , , 8     3.41  , 2.89  , New   , 0.52  
10    , 0     , , 9     2.83  , 2.87  , Cur   , 0.04  
10    , 1     , , 9     2.82  , 2.87  , Cur   , 0.05  
8     , 0     , , 9     3.4   , 2.89  , New   , 0.51  
8     , 1     , , 9     3.41  , 2.87  , New   , 0.54  
11    , 0     , , 10    2.83  , 2.88  , Cur   , 0.05  
11    , 2     , , 10    2.84  , 2.88  , Cur   , 0.04  
9     , 0     , , 10    3.41  , 2.87  , New   , 0.54  
9     , 2     , , 10    3.41  , 2.88  , New   , 0.53  
12    , 0     , , 11    2.83  , 2.89  , Cur   , 0.06  
12    , 3     , , 11    2.85  , 2.87  , Cur   , 0.02  
10    , 0     , , 11    3.41  , 2.87  , New   , 0.54  
10    , 3     , , 11    3.42  , 2.88  , New   , 0.54  
13    , 0     , , 12    2.86  , 2.87  , Cur   , 0.01  
13    , 4     , , 12    2.84  , 2.88  , Cur   , 0.04  
11    , 0     , , 12    3.43  , 2.87  , New   , 0.56  
11    , 4     , , 12    3.49  , 2.87  , New   , 0.62  
14    , 0     , , 13    2.85  , 2.86  , Cur   , 0.01  
14    , 5     , , 13    2.85  , 2.86  , Cur   , 0.01  
12    , 0     , , 13    3.41  , 2.86  , New   , 0.55  
12    , 5     , , 13    3.44  , 2.85  , New   , 0.59  
15    , 0     , , 14    2.83  , 2.87  , Cur   , 0.04  
15    , 6     , , 14    2.82  , 2.86  , Cur   , 0.04  
13    , 0     , , 14    3.41  , 2.86  , New   , 0.55  
13    , 6     , , 14    3.4   , 2.86  , New   , 0.54  
16    , 0     , , 15    2.84  , 2.86  , Cur   , 0.02  
16    , 7     , , 15    2.83  , 2.85  , Cur   , 0.02  
14    , 0     , , 15    3.41  , 2.85  , New   , 0.56  
14    , 7     , , 15    3.39  , 2.87  , New   , 0.52  
17    , 0     , , 16    2.83  , 2.87  , Cur   , 0.04  
15    , 0     , , 16    3.4   , 2.85  , New   , 0.55  
18    , 0     , , 17    2.83  , 2.86  , Cur   , 0.03  
18    , 1     , , 17    2.85  , 2.84  , New   , 0.01  
16    , 0     , , 17    3.41  , 2.85  , New   , 0.56  
16    , 1     , , 17    3.4   , 2.86  , New   , 0.54  
19    , 0     , , 18    2.8   , 2.84  , Cur   , 0.04  
19    , 2     , , 18    2.82  , 2.83  , Cur   , 0.01  
17    , 0     , , 18    3.39  , 2.86  , New   , 0.53  
17    , 2     , , 18    3.39  , 2.84  , New   , 0.55  
20    , 0     , , 19    2.85  , 2.87  , Cur   , 0.02  
20    , 3     , , 19    2.88  , 2.87  , New   , 0.01  
18    , 0     , , 19    3.38  , 2.85  , New   , 0.53  
18    , 3     , , 19    3.4   , 2.85  , New   , 0.55  
21    , 0     , , 20    2.83  , 2.85  , Cur   , 0.02  
21    , 4     , , 20    2.88  , 2.85  , New   , 0.03  
19    , 0     , , 20    3.39  , 2.84  , New   , 0.55  
19    , 4     , , 20    3.39  , 2.96  , New   , 0.43  
22    , 0     , , 21    2.84  , 2.9   , Cur   , 0.06  
22    , 5     , , 21    2.81  , 2.84  , Cur   , 0.03  
20    , 0     , , 21    3.41  , 2.81  , New   , 0.6   
20    , 5     , , 21    3.38  , 2.83  , New   , 0.55  
23    , 0     , , 22    2.8   , 2.82  , Cur   , 0.02  
23    , 6     , , 22    2.81  , 2.83  , Cur   , 0.02  
21    , 0     , , 22    3.35  , 2.81  , New   , 0.54  
21    , 6     , , 22    3.34  , 2.81  , New   , 0.53  
24    , 0     , , 23    2.77  , 2.84  , Cur   , 0.07  
24    , 7     , , 23    2.78  , 2.8   , Cur   , 0.02  
22    , 0     , , 23    3.34  , 2.79  , New   , 0.55  
22    , 7     , , 23    3.32  , 2.79  , New   , 0.53  
25    , 0     , , 24    2.77  , 2.8   , Cur   , 0.03  
23    , 0     , , 24    3.29  , 2.79  , New   , 0.5   
26    , 0     , , 25    2.73  , 2.78  , Cur   , 0.05  
26    , 1     , , 25    2.75  , 2.79  , Cur   , 0.04  
24    , 0     , , 25    3.27  , 2.79  , New   , 0.48  
24    , 1     , , 25    3.27  , 2.77  , New   , 0.5   
27    , 0     , , 26    2.72  , 2.78  , Cur   , 0.06  
27    , 2     , , 26    2.75  , 2.76  , Cur   , 0.01  
25    , 0     , , 26    3.29  , 2.73  , New   , 0.56  
25    , 2     , , 26    3.3   , 2.76  , New   , 0.54  
28    , 0     , , 27    2.75  , 2.79  , Cur   , 0.04  
28    , 3     , , 27    2.77  , 2.77  , Eq    , 0.0
26    , 0     , , 27    3.28  , 2.78  , New   , 0.5   
26    , 3     , , 27    3.29  , 2.78  , New   , 0.51  
29    , 0     , , 28    2.74  , 2.76  , Cur   , 0.02  
29    , 4     , , 28    2.74  , 2.77  , Cur   , 0.03  
27    , 0     , , 28    3.3   , 2.76  , New   , 0.54  
27    , 4     , , 28    3.3   , 2.74  , New   , 0.56  
30    , 0     , , 29    2.72  , 2.76  , Cur   , 0.04  
30    , 5     , , 29    2.74  , 2.75  , Cur   , 0.01  
28    , 0     , , 29    3.25  , 2.73  , New   , 0.52  
28    , 5     , , 29    3.3   , 2.73  , New   , 0.57  
31    , 0     , , 30    2.73  , 2.77  , Cur   , 0.04  
31    , 6     , , 30    2.74  , 2.76  , Cur   , 0.02  
29    , 0     , , 30    3.25  , 2.73  , New   , 0.52  
29    , 6     , , 30    3.26  , 2.74  , New   , 0.52  
32    , 0     , , 31    2.73  , 2.74  , Cur   , 0.01  
32    , 7     , , 31    2.73  , 2.75  , Cur   , 0.02  
30    , 0     , , 31    3.24  , 2.72  , New   , 0.52  
30    , 7     , , 31    3.24  , 2.72  , New   , 0.52

For memchr-avx2 the improvements are more modest though again near
universal. The improvement is most significant for medium sizes and
small sizes with pos > size. For small sizes with pos < size and for
large sizes the two implementations perform roughly the same.

Results For Tigerlake memchr-avx2
size  , algn  , Pos   , Cur T , New T , Win   , Dif   
2048  , 0     , , 32    6.15  , 6.27  , Cur   , 0.12  
256   , 1     , , 64    6.21  , 6.03  , New   , 0.18  
2048  , 0     , , 64    6.07  , 5.95  , New   , 0.12  
256   , 2     , , 64    6.01  , 5.8   , New   , 0.21  
2048  , 0     , , 128   7.05  , 6.55  , New   , 0.5   
256   , 3     , , 64    6.14  , 5.83  , New   , 0.31  
2048  , 0     , , 256   11.78 , 11.78 , Eq    , 0.0
256   , 4     , , 64    6.1   , 5.85  , New   , 0.25  
2048  , 0     , , 512   16.32 , 15.96 , New   , 0.36  
256   , 5     , , 64    6.1   , 5.77  , New   , 0.33  
2048  , 0     , , 1024  25.38 , 25.18 , New   , 0.2   
256   , 6     , , 64    6.08  , 5.88  , New   , 0.2   
2048  , 0     , , 2048  38.56 , 38.32 , New   , 0.24  
256   , 7     , , 64    5.93  , 5.68  , New   , 0.25  
192   , 1     , , 32    5.49  , 5.3   , New   , 0.19  
256   , 1     , , 32    5.5   , 5.28  , New   , 0.22  
512   , 1     , , 32    5.48  , 5.32  , New   , 0.16  
192   , 2     , , 64    6.1   , 5.73  , New   , 0.37  
512   , 2     , , 64    5.88  , 5.72  , New   , 0.16  
192   , 3     , , 96    6.31  , 5.93  , New   , 0.38  
256   , 3     , , 96    6.32  , 5.93  , New   , 0.39  
512   , 3     , , 96    6.2   , 5.94  , New   , 0.26  
192   , 4     , , 128   6.65  , 6.4   , New   , 0.25  
256   , 4     , , 128   6.6   , 6.37  , New   , 0.23  
512   , 4     , , 128   6.74  , 6.33  , New   , 0.41  
192   , 5     , , 160   7.78  , 7.4   , New   , 0.38  
256   , 5     , , 160   7.18  , 7.4   , Cur   , 0.22  
512   , 5     , , 160   9.81  , 9.44  , New   , 0.37  
192   , 6     , , 192   9.12  , 7.77  , New   , 1.35  
256   , 6     , , 192   7.97  , 7.66  , New   , 0.31  
512   , 6     , , 192   10.14 , 9.95  , New   , 0.19  
192   , 7     , , 224   8.96  , 7.78  , New   , 1.18  
256   , 7     , , 224   8.52  , 8.23  , New   , 0.29  
512   , 7     , , 224   10.33 , 9.98  , New   , 0.35  
2     , 0     , , 1     3.61  , 3.6   , New   , 0.01  
2     , 1     , , 1     3.6   , 3.6   , Eq    , 0.0
0     , 0     , , 1     3.02  , 3.0   , New   , 0.02  
0     , 1     , , 1     3.0   , 3.0   , Eq    , 0.0
3     , 0     , , 2     3.6   , 3.6   , Eq    , 0.0
3     , 2     , , 2     3.61  , 3.6   , New   , 0.01  
1     , 0     , , 2     4.82  , 3.6   , New   , 1.22  
1     , 2     , , 2     4.81  , 3.6   , New   , 1.21  
4     , 0     , , 3     3.61  , 3.61  , Eq    , 0.0
4     , 3     , , 3     3.62  , 3.61  , New   , 0.01  
2     , 0     , , 3     4.82  , 3.62  , New   , 1.2   
2     , 3     , , 3     4.83  , 3.63  , New   , 1.2   
5     , 0     , , 4     3.63  , 3.64  , Cur   , 0.01  
5     , 4     , , 4     3.63  , 3.62  , New   , 0.01  
3     , 0     , , 4     4.84  , 3.62  , New   , 1.22  
3     , 4     , , 4     4.84  , 3.64  , New   , 1.2   
6     , 0     , , 5     3.66  , 3.64  , New   , 0.02  
6     , 5     , , 5     3.65  , 3.62  , New   , 0.03  
4     , 0     , , 5     4.83  , 3.63  , New   , 1.2   
4     , 5     , , 5     4.85  , 3.64  , New   , 1.21  
7     , 0     , , 6     3.76  , 3.79  , Cur   , 0.03  
7     , 6     , , 6     3.76  , 3.72  , New   , 0.04  
5     , 0     , , 6     4.84  , 3.62  , New   , 1.22  
5     , 6     , , 6     4.85  , 3.64  , New   , 1.21  
8     , 0     , , 7     3.64  , 3.65  , Cur   , 0.01  
8     , 7     , , 7     3.65  , 3.65  , Eq    , 0.0
6     , 0     , , 7     4.88  , 3.64  , New   , 1.24  
6     , 7     , , 7     4.87  , 3.65  , New   , 1.22  
9     , 0     , , 8     3.66  , 3.66  , Eq    , 0.0
7     , 0     , , 8     4.89  , 3.66  , New   , 1.23  
10    , 0     , , 9     3.67  , 3.67  , Eq    , 0.0
10    , 1     , , 9     3.67  , 3.67  , Eq    , 0.0
8     , 0     , , 9     4.9   , 3.67  , New   , 1.23  
8     , 1     , , 9     4.9   , 3.67  , New   , 1.23  
11    , 0     , , 10    3.68  , 3.67  , New   , 0.01  
11    , 2     , , 10    3.69  , 3.67  , New   , 0.02  
9     , 0     , , 10    4.9   , 3.67  , New   , 1.23  
9     , 2     , , 10    4.9   , 3.67  , New   , 1.23  
12    , 0     , , 11    3.71  , 3.68  , New   , 0.03  
12    , 3     , , 11    3.71  , 3.67  , New   , 0.04  
10    , 0     , , 11    4.9   , 3.67  , New   , 1.23  
10    , 3     , , 11    4.9   , 3.67  , New   , 1.23  
13    , 0     , , 12    4.24  , 4.23  , New   , 0.01  
13    , 4     , , 12    4.23  , 4.23  , Eq    , 0.0
11    , 0     , , 12    4.9   , 3.7   , New   , 1.2   
11    , 4     , , 12    4.9   , 3.73  , New   , 1.17  
14    , 0     , , 13    3.99  , 4.01  , Cur   , 0.02  
14    , 5     , , 13    3.98  , 3.98  , Eq    , 0.0
12    , 0     , , 13    4.9   , 3.69  , New   , 1.21  
12    , 5     , , 13    4.9   , 3.69  , New   , 1.21  
15    , 0     , , 14    3.99  , 3.97  , New   , 0.02  
15    , 6     , , 14    4.0   , 4.0   , Eq    , 0.0
13    , 0     , , 14    4.9   , 3.67  , New   , 1.23  
13    , 6     , , 14    4.9   , 3.67  , New   , 1.23  
16    , 0     , , 15    3.99  , 4.02  , Cur   , 0.03  
16    , 7     , , 15    4.01  , 3.96  , New   , 0.05  
14    , 0     , , 15    4.93  , 3.67  , New   , 1.26  
14    , 7     , , 15    4.92  , 3.67  , New   , 1.25  
17    , 0     , , 16    4.04  , 3.99  , New   , 0.05  
15    , 0     , , 16    5.42  , 4.22  , New   , 1.2   
18    , 0     , , 17    4.01  , 3.97  , New   , 0.04  
18    , 1     , , 17    3.99  , 3.98  , New   , 0.01  
16    , 0     , , 17    5.22  , 3.98  , New   , 1.24  
16    , 1     , , 17    5.19  , 3.98  , New   , 1.21  
19    , 0     , , 18    4.0   , 3.99  , New   , 0.01  
19    , 2     , , 18    4.03  , 3.97  , New   , 0.06  
17    , 0     , , 18    5.18  , 3.99  , New   , 1.19  
17    , 2     , , 18    5.18  , 3.98  , New   , 1.2   
20    , 0     , , 19    4.02  , 3.98  , New   , 0.04  
20    , 3     , , 19    4.0   , 3.98  , New   , 0.02  
18    , 0     , , 19    5.19  , 3.97  , New   , 1.22  
18    , 3     , , 19    5.21  , 3.98  , New   , 1.23  
21    , 0     , , 20    3.98  , 4.0   , Cur   , 0.02  
21    , 4     , , 20    4.0   , 4.0   , Eq    , 0.0
19    , 0     , , 20    5.19  , 3.99  , New   , 1.2   
19    , 4     , , 20    5.17  , 3.99  , New   , 1.18  
22    , 0     , , 21    4.03  , 3.98  , New   , 0.05  
22    , 5     , , 21    4.01  , 3.95  , New   , 0.06  
20    , 0     , , 21    5.19  , 4.0   , New   , 1.19  
20    , 5     , , 21    5.21  , 3.99  , New   , 1.22  
23    , 0     , , 22    4.06  , 3.97  , New   , 0.09  
23    , 6     , , 22    4.02  , 3.98  , New   , 0.04  
21    , 0     , , 22    5.2   , 4.02  , New   , 1.18  
21    , 6     , , 22    5.22  , 4.0   , New   , 1.22  
24    , 0     , , 23    4.15  , 3.98  , New   , 0.17  
24    , 7     , , 23    4.0   , 4.01  , Cur   , 0.01  
22    , 0     , , 23    5.28  , 4.0   , New   , 1.28  
22    , 7     , , 23    5.22  , 3.99  , New   , 1.23  
25    , 0     , , 24    4.1   , 4.04  , New   , 0.06  
23    , 0     , , 24    5.23  , 4.04  , New   , 1.19  
26    , 0     , , 25    4.1   , 4.06  , New   , 0.04  
26    , 1     , , 25    4.07  , 3.99  , New   , 0.08  
24    , 0     , , 25    5.26  , 4.02  , New   , 1.24  
24    , 1     , , 25    5.21  , 4.0   , New   , 1.21  
27    , 0     , , 26    4.17  , 4.03  , New   , 0.14  
27    , 2     , , 26    4.09  , 4.03  , New   , 0.06  
25    , 0     , , 26    5.29  , 4.1   , New   , 1.19  
25    , 2     , , 26    5.25  , 4.0   , New   , 1.25  
28    , 0     , , 27    4.06  , 4.1   , Cur   , 0.04  
28    , 3     , , 27    4.09  , 4.04  , New   , 0.05  
26    , 0     , , 27    5.26  , 4.04  , New   , 1.22  
26    , 3     , , 27    5.28  , 4.01  , New   , 1.27  
29    , 0     , , 28    4.07  , 4.02  , New   , 0.05  
29    , 4     , , 28    4.07  , 4.05  , New   , 0.02  
27    , 0     , , 28    5.25  , 4.02  , New   , 1.23  
27    , 4     , , 28    5.25  , 4.03  , New   , 1.22  
30    , 0     , , 29    4.14  , 4.06  , New   , 0.08  
30    , 5     , , 29    4.08  , 4.04  , New   , 0.04  
28    , 0     , , 29    5.26  , 4.07  , New   , 1.19  
28    , 5     , , 29    5.28  , 4.04  , New   , 1.24  
31    , 0     , , 30    4.09  , 4.08  , New   , 0.01  
31    , 6     , , 30    4.1   , 4.08  , New   , 0.02  
29    , 0     , , 30    5.28  , 4.05  , New   , 1.23  
29    , 6     , , 30    5.24  , 4.07  , New   , 1.17  
32    , 0     , , 31    4.1   , 4.13  , Cur   , 0.03  
32    , 7     , , 31    4.16  , 4.09  , New   , 0.07  
30    , 0     , , 31    5.31  , 4.09  , New   , 1.22  
30    , 7     , , 31    5.28  , 4.08  , New   , 1.2

Results For Icelake memchr-avx2
size  , algn  , Pos   , Cur T , New T , Win   , Dif   
2048  , 0     , , 32    5.74  , 5.08  , New   , 0.66  
256   , 1     , , 64    5.16  , 4.93  , New   , 0.23  
2048  , 0     , , 64    4.86  , 4.69  , New   , 0.17  
256   , 2     , , 64    4.78  , 4.7   , New   , 0.08  
2048  , 0     , , 128   5.64  , 5.0   , New   , 0.64  
256   , 3     , , 64    4.64  , 4.59  , New   , 0.05  
2048  , 0     , , 256   9.07  , 9.17  , Cur   , 0.1   
256   , 4     , , 64    4.7   , 4.6   , New   , 0.1   
2048  , 0     , , 512   12.56 , 12.33 , New   , 0.23  
256   , 5     , , 64    4.72  , 4.61  , New   , 0.11  
2048  , 0     , , 1024  19.36 , 19.49 , Cur   , 0.13  
256   , 6     , , 64    4.82  , 4.69  , New   , 0.13  
2048  , 0     , , 2048  29.99 , 30.53 , Cur   , 0.54  
256   , 7     , , 64    4.9   , 4.85  , New   , 0.05  
192   , 1     , , 32    4.89  , 4.45  , New   , 0.44  
256   , 1     , , 32    4.93  , 4.44  , New   , 0.49  
512   , 1     , , 32    4.97  , 4.45  , New   , 0.52  
192   , 2     , , 64    5.04  , 4.65  , New   , 0.39  
512   , 2     , , 64    4.75  , 4.66  , New   , 0.09  
192   , 3     , , 96    5.14  , 4.66  , New   , 0.48  
256   , 3     , , 96    5.12  , 4.66  , New   , 0.46  
512   , 3     , , 96    5.13  , 4.62  , New   , 0.51  
192   , 4     , , 128   5.65  , 4.95  , New   , 0.7   
256   , 4     , , 128   5.63  , 4.95  , New   , 0.68  
512   , 4     , , 128   5.68  , 4.96  , New   , 0.72  
192   , 5     , , 160   6.1   , 5.84  , New   , 0.26  
256   , 5     , , 160   5.58  , 5.84  , Cur   , 0.26  
512   , 5     , , 160   7.95  , 7.74  , New   , 0.21  
192   , 6     , , 192   7.07  , 6.23  , New   , 0.84  
256   , 6     , , 192   6.34  , 6.09  , New   , 0.25  
512   , 6     , , 192   8.17  , 8.13  , New   , 0.04  
192   , 7     , , 224   7.06  , 6.23  , New   , 0.83  
256   , 7     , , 224   6.76  , 6.65  , New   , 0.11  
512   , 7     , , 224   8.29  , 8.08  , New   , 0.21  
2     , 0     , , 1     3.0   , 3.04  , Cur   , 0.04  
2     , 1     , , 1     3.06  , 3.07  , Cur   , 0.01  
0     , 0     , , 1     2.57  , 2.59  , Cur   , 0.02  
0     , 1     , , 1     2.6   , 2.61  , Cur   , 0.01  
3     , 0     , , 2     3.15  , 3.17  , Cur   , 0.02  
3     , 2     , , 2     3.19  , 3.21  , Cur   , 0.02  
1     , 0     , , 2     4.32  , 3.25  , New   , 1.07  
1     , 2     , , 2     4.36  , 3.31  , New   , 1.05  
4     , 0     , , 3     3.5   , 3.52  , Cur   , 0.02  
4     , 3     , , 3     3.52  , 3.54  , Cur   , 0.02  
2     , 0     , , 3     4.51  , 3.43  , New   , 1.08  
2     , 3     , , 3     4.56  , 3.47  , New   , 1.09  
5     , 0     , , 4     3.61  , 3.65  , Cur   , 0.04  
5     , 4     , , 4     3.63  , 3.67  , Cur   , 0.04  
3     , 0     , , 4     4.64  , 3.51  , New   , 1.13  
3     , 4     , , 4     4.7   , 3.51  , New   , 1.19  
6     , 0     , , 5     3.66  , 3.68  , Cur   , 0.02  
6     , 5     , , 5     3.69  , 3.65  , New   , 0.04  
4     , 0     , , 5     4.7   , 3.49  , New   , 1.21  
4     , 5     , , 5     4.58  , 3.48  , New   , 1.1   
7     , 0     , , 6     3.6   , 3.65  , Cur   , 0.05  
7     , 6     , , 6     3.59  , 3.64  , Cur   , 0.05  
5     , 0     , , 6     4.74  , 3.65  , New   , 1.09  
5     , 6     , , 6     4.73  , 3.64  , New   , 1.09  
8     , 0     , , 7     3.6   , 3.61  , Cur   , 0.01  
8     , 7     , , 7     3.6   , 3.61  , Cur   , 0.01  
6     , 0     , , 7     4.73  , 3.6   , New   , 1.13  
6     , 7     , , 7     4.73  , 3.62  , New   , 1.11  
9     , 0     , , 8     3.59  , 3.62  , Cur   , 0.03  
7     , 0     , , 8     4.72  , 3.64  , New   , 1.08  
10    , 0     , , 9     3.57  , 3.62  , Cur   , 0.05  
10    , 1     , , 9     3.56  , 3.61  , Cur   , 0.05  
8     , 0     , , 9     4.69  , 3.63  , New   , 1.06  
8     , 1     , , 9     4.71  , 3.61  , New   , 1.1   
11    , 0     , , 10    3.58  , 3.62  , Cur   , 0.04  
11    , 2     , , 10    3.59  , 3.63  , Cur   , 0.04  
9     , 0     , , 10    4.72  , 3.61  , New   , 1.11  
9     , 2     , , 10    4.7   , 3.61  , New   , 1.09  
12    , 0     , , 11    3.58  , 3.63  , Cur   , 0.05  
12    , 3     , , 11    3.58  , 3.62  , Cur   , 0.04  
10    , 0     , , 11    4.7   , 3.6   , New   , 1.1   
10    , 3     , , 11    4.73  , 3.64  , New   , 1.09  
13    , 0     , , 12    3.6   , 3.6   , Eq    , 0.0
13    , 4     , , 12    3.57  , 3.62  , Cur   , 0.05  
11    , 0     , , 12    4.73  , 3.62  , New   , 1.11  
11    , 4     , , 12    4.79  , 3.61  , New   , 1.18  
14    , 0     , , 13    3.61  , 3.62  , Cur   , 0.01  
14    , 5     , , 13    3.59  , 3.59  , Eq    , 0.0
12    , 0     , , 13    4.7   , 3.61  , New   , 1.09  
12    , 5     , , 13    4.75  , 3.58  , New   , 1.17  
15    , 0     , , 14    3.58  , 3.62  , Cur   , 0.04  
15    , 6     , , 14    3.59  , 3.62  , Cur   , 0.03  
13    , 0     , , 14    4.68  , 3.6   , New   , 1.08  
13    , 6     , , 14    4.68  , 3.63  , New   , 1.05  
16    , 0     , , 15    3.57  , 3.6   , Cur   , 0.03  
16    , 7     , , 15    3.55  , 3.59  , Cur   , 0.04  
14    , 0     , , 15    4.69  , 3.61  , New   , 1.08  
14    , 7     , , 15    4.69  , 3.61  , New   , 1.08  
17    , 0     , , 16    3.56  , 3.61  , Cur   , 0.05  
15    , 0     , , 16    4.71  , 3.58  , New   , 1.13  
18    , 0     , , 17    3.57  , 3.65  , Cur   , 0.08  
18    , 1     , , 17    3.58  , 3.59  , Cur   , 0.01  
16    , 0     , , 17    4.7   , 3.58  , New   , 1.12  
16    , 1     , , 17    4.68  , 3.59  , New   , 1.09  
19    , 0     , , 18    3.51  , 3.58  , Cur   , 0.07  
19    , 2     , , 18    3.55  , 3.58  , Cur   , 0.03  
17    , 0     , , 18    4.69  , 3.61  , New   , 1.08  
17    , 2     , , 18    4.68  , 3.61  , New   , 1.07  
20    , 0     , , 19    3.57  , 3.6   , Cur   , 0.03  
20    , 3     , , 19    3.59  , 3.59  , Eq    , 0.0
18    , 0     , , 19    4.68  , 3.59  , New   , 1.09  
18    , 3     , , 19    4.67  , 3.57  , New   , 1.1   
21    , 0     , , 20    3.61  , 3.58  , New   , 0.03  
21    , 4     , , 20    3.62  , 3.6   , New   , 0.02  
19    , 0     , , 20    4.74  , 3.57  , New   , 1.17  
19    , 4     , , 20    4.69  , 3.7   , New   , 0.99  
22    , 0     , , 21    3.57  , 3.64  , Cur   , 0.07  
22    , 5     , , 21    3.55  , 3.6   , Cur   , 0.05  
20    , 0     , , 21    4.72  , 3.55  , New   , 1.17  
20    , 5     , , 21    4.66  , 3.55  , New   , 1.11  
23    , 0     , , 22    3.56  , 3.56  , Eq    , 0.0
23    , 6     , , 22    3.54  , 3.56  , Cur   , 0.02  
21    , 0     , , 22    4.65  , 3.53  , New   , 1.12  
21    , 6     , , 22    4.62  , 3.56  , New   , 1.06  
24    , 0     , , 23    3.5   , 3.54  , Cur   , 0.04  
24    , 7     , , 23    3.52  , 3.53  , Cur   , 0.01  
22    , 0     , , 23    4.61  , 3.51  , New   , 1.1   
22    , 7     , , 23    4.6   , 3.51  , New   , 1.09  
25    , 0     , , 24    3.5   , 3.53  , Cur   , 0.03  
23    , 0     , , 24    4.54  , 3.5   , New   , 1.04  
26    , 0     , , 25    3.47  , 3.49  , Cur   , 0.02  
26    , 1     , , 25    3.46  , 3.51  , Cur   , 0.05  
24    , 0     , , 25    4.53  , 3.51  , New   , 1.02  
24    , 1     , , 25    4.51  , 3.51  , New   , 1.0   
27    , 0     , , 26    3.44  , 3.51  , Cur   , 0.07  
27    , 2     , , 26    3.51  , 3.52  , Cur   , 0.01  
25    , 0     , , 26    4.56  , 3.46  , New   , 1.1   
25    , 2     , , 26    4.55  , 3.47  , New   , 1.08  
28    , 0     , , 27    3.47  , 3.5   , Cur   , 0.03  
28    , 3     , , 27    3.48  , 3.47  , New   , 0.01  
26    , 0     , , 27    4.52  , 3.44  , New   , 1.08  
26    , 3     , , 27    4.55  , 3.46  , New   , 1.09  
29    , 0     , , 28    3.45  , 3.49  , Cur   , 0.04  
29    , 4     , , 28    3.5   , 3.5   , Eq    , 0.0
27    , 0     , , 28    4.56  , 3.49  , New   , 1.07  
27    , 4     , , 28    4.5   , 3.49  , New   , 1.01  
30    , 0     , , 29    3.44  , 3.48  , Cur   , 0.04  
30    , 5     , , 29    3.46  , 3.47  , Cur   , 0.01  
28    , 0     , , 29    4.49  , 3.43  , New   , 1.06  
28    , 5     , , 29    4.57  , 3.45  , New   , 1.12  
31    , 0     , , 30    3.48  , 3.48  , Eq    , 0.0
31    , 6     , , 30    3.46  , 3.49  , Cur   , 0.03  
29    , 0     , , 30    4.49  , 3.44  , New   , 1.05  
29    , 6     , , 30    4.53  , 3.44  , New   , 1.09  
32    , 0     , , 31    3.44  , 3.45  , Cur   , 0.01  
32    , 7     , , 31    3.46  , 3.51  , Cur   , 0.05  
30    , 0     , , 31    4.48  , 3.42  , New   , 1.06  
30    , 7     , , 31    4.48  , 3.44  , New   , 1.04


Results For Skylake memchr-avx2
size  , algn  , Pos   , Cur T , New T , Win   , Dif   
2048  , 0     , , 32    6.61  , 5.4   , New   , 1.21  
256   , 1     , , 64    6.52  , 5.68  , New   , 0.84  
2048  , 0     , , 64    6.03  , 5.47  , New   , 0.56  
256   , 2     , , 64    6.07  , 5.42  , New   , 0.65  
2048  , 0     , , 128   7.01  , 5.83  , New   , 1.18  
256   , 3     , , 64    6.24  , 5.68  , New   , 0.56  
2048  , 0     , , 256   11.03 , 9.86  , New   , 1.17  
256   , 4     , , 64    6.17  , 5.49  , New   , 0.68  
2048  , 0     , , 512   14.11 , 13.41 , New   , 0.7   
256   , 5     , , 64    6.03  , 5.45  , New   , 0.58  
2048  , 0     , , 1024  19.82 , 19.92 , Cur   , 0.1   
256   , 6     , , 64    6.14  , 5.7   , New   , 0.44  
2048  , 0     , , 2048  30.9  , 30.59 , New   , 0.31  
256   , 7     , , 64    6.05  , 5.64  , New   , 0.41  
192   , 1     , , 32    5.6   , 4.89  , New   , 0.71  
256   , 1     , , 32    5.59  , 5.07  , New   , 0.52  
512   , 1     , , 32    5.58  , 4.93  , New   , 0.65  
192   , 2     , , 64    6.14  , 5.46  , New   , 0.68  
512   , 2     , , 64    5.95  , 5.38  , New   , 0.57  
192   , 3     , , 96    6.6   , 5.74  , New   , 0.86  
256   , 3     , , 96    6.48  , 5.37  , New   , 1.11  
512   , 3     , , 96    6.56  , 5.44  , New   , 1.12  
192   , 4     , , 128   7.04  , 6.02  , New   , 1.02  
256   , 4     , , 128   6.96  , 5.89  , New   , 1.07  
512   , 4     , , 128   6.97  , 5.99  , New   , 0.98  
192   , 5     , , 160   8.49  , 7.07  , New   , 1.42  
256   , 5     , , 160   8.1   , 6.96  , New   , 1.14  
512   , 5     , , 160   10.48 , 9.14  , New   , 1.34  
192   , 6     , , 192   8.46  , 8.52  , Cur   , 0.06  
256   , 6     , , 192   8.53  , 7.58  , New   , 0.95  
512   , 6     , , 192   10.88 , 9.06  , New   , 1.82  
192   , 7     , , 224   8.59  , 8.35  , New   , 0.24  
256   , 7     , , 224   8.86  , 7.91  , New   , 0.95  
512   , 7     , , 224   10.89 , 8.98  , New   , 1.91  
2     , 0     , , 1     4.28  , 3.62  , New   , 0.66  
2     , 1     , , 1     4.32  , 3.75  , New   , 0.57  
0     , 0     , , 1     3.76  , 3.24  , New   , 0.52  
0     , 1     , , 1     3.7   , 3.19  , New   , 0.51  
3     , 0     , , 2     4.16  , 3.67  , New   , 0.49  
3     , 2     , , 2     4.21  , 3.68  , New   , 0.53  
1     , 0     , , 2     4.25  , 3.74  , New   , 0.51  
1     , 2     , , 2     4.4   , 3.82  , New   , 0.58  
4     , 0     , , 3     4.43  , 3.88  , New   , 0.55  
4     , 3     , , 3     4.34  , 3.8   , New   , 0.54  
2     , 0     , , 3     4.33  , 3.79  , New   , 0.54  
2     , 3     , , 3     4.37  , 3.84  , New   , 0.53  
5     , 0     , , 4     4.45  , 3.87  , New   , 0.58  
5     , 4     , , 4     4.41  , 3.84  , New   , 0.57  
3     , 0     , , 4     4.34  , 3.83  , New   , 0.51  
3     , 4     , , 4     4.35  , 3.82  , New   , 0.53  
6     , 0     , , 5     4.41  , 3.88  , New   , 0.53  
6     , 5     , , 5     4.41  , 3.88  , New   , 0.53  
4     , 0     , , 5     4.35  , 3.84  , New   , 0.51  
4     , 5     , , 5     4.37  , 3.85  , New   , 0.52  
7     , 0     , , 6     4.4   , 3.84  , New   , 0.56  
7     , 6     , , 6     4.39  , 3.83  , New   , 0.56  
5     , 0     , , 6     4.37  , 3.85  , New   , 0.52  
5     , 6     , , 6     4.4   , 3.86  , New   , 0.54  
8     , 0     , , 7     4.39  , 3.88  , New   , 0.51  
8     , 7     , , 7     4.4   , 3.83  , New   , 0.57  
6     , 0     , , 7     4.39  , 3.85  , New   , 0.54  
6     , 7     , , 7     4.38  , 3.87  , New   , 0.51  
9     , 0     , , 8     4.47  , 3.96  , New   , 0.51  
7     , 0     , , 8     4.37  , 3.85  , New   , 0.52  
10    , 0     , , 9     4.61  , 4.08  , New   , 0.53  
10    , 1     , , 9     4.61  , 4.09  , New   , 0.52  
8     , 0     , , 9     4.37  , 3.85  , New   , 0.52  
8     , 1     , , 9     4.37  , 3.85  , New   , 0.52  
11    , 0     , , 10    4.68  , 4.06  , New   , 0.62  
11    , 2     , , 10    4.56  , 4.1   , New   , 0.46  
9     , 0     , , 10    4.36  , 3.83  , New   , 0.53  
9     , 2     , , 10    4.37  , 3.83  , New   , 0.54  
12    , 0     , , 11    4.62  , 4.05  , New   , 0.57  
12    , 3     , , 11    4.63  , 4.06  , New   , 0.57  
10    , 0     , , 11    4.38  , 3.86  , New   , 0.52  
10    , 3     , , 11    4.41  , 3.86  , New   , 0.55  
13    , 0     , , 12    4.57  , 4.08  , New   , 0.49  
13    , 4     , , 12    4.59  , 4.12  , New   , 0.47  
11    , 0     , , 12    4.45  , 4.0   , New   , 0.45  
11    , 4     , , 12    4.51  , 4.04  , New   , 0.47  
14    , 0     , , 13    4.64  , 4.16  , New   , 0.48  
14    , 5     , , 13    4.67  , 4.1   , New   , 0.57  
12    , 0     , , 13    4.58  , 4.08  , New   , 0.5   
12    , 5     , , 13    4.6   , 4.1   , New   , 0.5   
15    , 0     , , 14    4.61  , 4.05  , New   , 0.56  
15    , 6     , , 14    4.59  , 4.06  , New   , 0.53  
13    , 0     , , 14    4.57  , 4.06  , New   , 0.51  
13    , 6     , , 14    4.57  , 4.05  , New   , 0.52  
16    , 0     , , 15    4.62  , 4.05  , New   , 0.57  
16    , 7     , , 15    4.63  , 4.06  , New   , 0.57  
14    , 0     , , 15    4.61  , 4.06  , New   , 0.55  
14    , 7     , , 15    4.59  , 4.05  , New   , 0.54  
17    , 0     , , 16    4.58  , 4.08  , New   , 0.5   
15    , 0     , , 16    4.64  , 4.06  , New   , 0.58  
18    , 0     , , 17    4.56  , 4.17  , New   , 0.39  
18    , 1     , , 17    4.59  , 4.09  , New   , 0.5   
16    , 0     , , 17    4.59  , 4.07  , New   , 0.52  
16    , 1     , , 17    4.58  , 4.04  , New   , 0.54  
19    , 0     , , 18    4.61  , 4.05  , New   , 0.56  
19    , 2     , , 18    4.6   , 4.08  , New   , 0.52  
17    , 0     , , 18    4.64  , 4.11  , New   , 0.53  
17    , 2     , , 18    4.56  , 4.13  , New   , 0.43  
20    , 0     , , 19    4.77  , 4.3   , New   , 0.47  
20    , 3     , , 19    4.6   , 4.14  , New   , 0.46  
18    , 0     , , 19    4.72  , 4.02  , New   , 0.7   
18    , 3     , , 19    4.53  , 4.01  , New   , 0.52  
21    , 0     , , 20    4.66  , 4.26  , New   , 0.4   
21    , 4     , , 20    4.74  , 4.07  , New   , 0.67  
19    , 0     , , 20    4.62  , 4.12  , New   , 0.5   
19    , 4     , , 20    4.57  , 4.04  , New   , 0.53  
22    , 0     , , 21    4.61  , 4.13  , New   , 0.48  
22    , 5     , , 21    4.64  , 4.08  , New   , 0.56  
20    , 0     , , 21    4.49  , 4.01  , New   , 0.48  
20    , 5     , , 21    4.58  , 4.06  , New   , 0.52  
23    , 0     , , 22    4.62  , 4.13  , New   , 0.49  
23    , 6     , , 22    4.72  , 4.27  , New   , 0.45  
21    , 0     , , 22    4.65  , 3.97  , New   , 0.68  
21    , 6     , , 22    4.5   , 4.02  , New   , 0.48  
24    , 0     , , 23    4.78  , 4.07  , New   , 0.71  
24    , 7     , , 23    4.67  , 4.23  , New   , 0.44  
22    , 0     , , 23    4.49  , 3.99  , New   , 0.5   
22    , 7     , , 23    4.56  , 4.03  , New   , 0.53  
25    , 0     , , 24    4.6   , 4.15  , New   , 0.45  
23    , 0     , , 24    4.57  , 4.06  , New   , 0.51  
26    , 0     , , 25    4.54  , 4.14  , New   , 0.4   
26    , 1     , , 25    4.72  , 4.1   , New   , 0.62  
24    , 0     , , 25    4.52  , 4.13  , New   , 0.39  
24    , 1     , , 25    4.55  , 4.0   , New   , 0.55  
27    , 0     , , 26    4.51  , 4.06  , New   , 0.45  
27    , 2     , , 26    4.53  , 4.16  , New   , 0.37  
25    , 0     , , 26    4.59  , 4.09  , New   , 0.5   
25    , 2     , , 26    4.55  , 4.01  , New   , 0.54  
28    , 0     , , 27    4.59  , 3.99  , New   , 0.6   
28    , 3     , , 27    4.57  , 3.95  , New   , 0.62  
26    , 0     , , 27    4.55  , 4.15  , New   , 0.4   
26    , 3     , , 27    4.57  , 3.99  , New   , 0.58  
29    , 0     , , 28    4.41  , 4.03  , New   , 0.38  
29    , 4     , , 28    4.59  , 4.02  , New   , 0.57  
27    , 0     , , 28    4.63  , 4.08  , New   , 0.55  
27    , 4     , , 28    4.44  , 4.02  , New   , 0.42  
30    , 0     , , 29    4.53  , 3.93  , New   , 0.6   
30    , 5     , , 29    4.55  , 3.88  , New   , 0.67  
28    , 0     , , 29    4.49  , 3.9   , New   , 0.59  
28    , 5     , , 29    4.44  , 3.94  , New   , 0.5   
31    , 0     , , 30    4.41  , 3.85  , New   , 0.56  
31    , 6     , , 30    4.48  , 3.86  , New   , 0.62  
29    , 0     , , 30    4.55  , 3.94  , New   , 0.61  
29    , 6     , , 30    4.32  , 3.95  , New   , 0.37  
32    , 0     , , 31    4.36  , 3.91  , New   , 0.45  
32    , 7     , , 31    4.37  , 3.89  , New   , 0.48  
30    , 0     , , 31    4.65  , 3.9   , New   , 0.75  
30    , 7     , , 31    4.42  , 3.93  , New   , 0.49  

 sysdeps/x86_64/multiarch/memchr-evex.S | 580 +++++++++++++++----------
 1 file changed, 349 insertions(+), 231 deletions(-)

-- 
2.29.2

Comments

Adhemerval Zanella via Libc-alpha May 3, 2021, 6:58 p.m. | #1
On Mon, May 03, 2021 at 04:44:38AM -0400, Noah Goldstein wrote:
> No bug. This commit optimizes memchr-evex.S. The optimizations include

> replacing some branches with cmovcc, avoiding some branches entirely

> in the less_4x_vec case, making the page cross logic less strict,

> saving some ALU in the alignment process, and most importantly

> increasing ILP in the 4x loop. test-memchr, test-rawmemchr, and

> test-wmemchr are all passing.

> 

> Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>

> ---

> Tests were run on the following CPUs:

> 

> Tigerlake: https://ark.intel.com/content/www/us/en/ark/products/208921/intel-core-i7-1165g7-processor-12m-cache-up-to-4-70-ghz-with-ipu.html

> 

> Icelake: https://ark.intel.com/content/www/us/en/ark/products/196597/intel-core-i7-1065g7-processor-8m-cache-up-to-3-90-ghz.html

> 

> Skylake: https://ark.intel.com/content/www/us/en/ark/products/149091/intel-core-i7-8565u-processor-8m-cache-up-to-4-60-ghz.html

> 

> All times are the geometric mean of N=20. The unit of time is

> seconds.

> 

> "Cur" refers to the current implementation

> "New" refers to this patch's implementation

> 

> Note: The numbers for size = [1, 32] are highly dependent on function

> alignment. That being said the new implementation which uses cmovcc

> instead of a branch (mostly for the reason of high variance with

> different alignments) for the [1, 32] case is far more consistent and

> performs about as well (and should only be a bigger improvement in

> cases where the sizes / position are not 100% predictable).

> 

> For memchr-evex the numbers are a near universal improvement. The case

> where the current implementation is better is for size = 0, and for size =

> [1, 32] with pos < size the two implementations are about the

> same. For size = [1, 32] with pos > size, for medium range sizes, and

> large size, however, the new implementation is faster.

> 

> Results For Tigerlake memchr-evex

> size  , algn  , Pos   , Cur T , New T , Win   , Dif   

> 2048  , 0     , , 32    5.58  , 5.22  , New   , 0.36  

> 256   , 1     , , 64    5.22  , 4.93  , New   , 0.29  

> 2048  , 0     , , 64    5.22  , 4.89  , New   , 0.33  

> 256   , 2     , , 64    5.14  , 4.81  , New   , 0.33  

> 2048  , 0     , , 128   6.3   , 5.67  , New   , 0.63  

> 256   , 3     , , 64    5.22  , 4.9   , New   , 0.32  

> 2048  , 0     , , 256   11.07 , 10.92 , New   , 0.15  

> 256   , 4     , , 64    5.16  , 4.86  , New   , 0.3   

> 2048  , 0     , , 512   15.66 , 14.81 , New   , 0.85  

> 256   , 5     , , 64    5.15  , 4.84  , New   , 0.31  

> 2048  , 0     , , 1024  25.7  , 23.02 , New   , 2.68  

> 256   , 6     , , 64    5.12  , 4.89  , New   , 0.23  

> 2048  , 0     , , 2048  42.34 , 37.71 , New   , 4.63  

> 256   , 7     , , 64    5.03  , 4.62  , New   , 0.41  

> 192   , 1     , , 32    4.96  , 4.28  , New   , 0.68  

> 256   , 1     , , 32    4.95  , 4.28  , New   , 0.67  

> 512   , 1     , , 32    4.94  , 4.29  , New   , 0.65  

> 192   , 2     , , 64    5.1   , 4.8   , New   , 0.3   

> 512   , 2     , , 64    5.12  , 4.72  , New   , 0.4   

> 192   , 3     , , 96    5.54  , 5.12  , New   , 0.42  

> 256   , 3     , , 96    5.52  , 5.15  , New   , 0.37  

> 512   , 3     , , 96    5.51  , 5.16  , New   , 0.35  

> 192   , 4     , , 128   6.1   , 5.53  , New   , 0.57  

> 256   , 4     , , 128   6.09  , 5.49  , New   , 0.6   

> 512   , 4     , , 128   6.08  , 5.48  , New   , 0.6   

> 192   , 5     , , 160   7.42  , 6.71  , New   , 0.71  

> 256   , 5     , , 160   6.86  , 6.71  , New   , 0.15  

> 512   , 5     , , 160   9.28  , 8.68  , New   , 0.6   

> 192   , 6     , , 192   7.94  , 7.47  , New   , 0.47  

> 256   , 6     , , 192   7.62  , 7.17  , New   , 0.45  

> 512   , 6     , , 192   9.2   , 9.16  , New   , 0.04  

> 192   , 7     , , 224   8.02  , 7.43  , New   , 0.59  

> 256   , 7     , , 224   8.34  , 7.85  , New   , 0.49  

> 512   , 7     , , 224   9.89  , 9.16  , New   , 0.73  

> 2     , 0     , , 1     3.0   , 3.0   , Eq    , 0.0

> 2     , 1     , , 1     3.0   , 3.0   , Eq    , 0.0

> 0     , 0     , , 1     3.01  , 3.6   , Cur   , 0.59  

> 0     , 1     , , 1     3.01  , 3.6   , Cur   , 0.59  

> 3     , 0     , , 2     3.0   , 3.0   , Eq    , 0.0

> 3     , 2     , , 2     3.0   , 3.0   , Eq    , 0.0

> 1     , 0     , , 2     3.6   , 3.0   , New   , 0.6   

> 1     , 2     , , 2     3.6   , 3.0   , New   , 0.6   

> 4     , 0     , , 3     3.01  , 3.01  , Eq    , 0.0

> 4     , 3     , , 3     3.01  , 3.01  , Eq    , 0.0

> 2     , 0     , , 3     3.62  , 3.02  , New   , 0.6   

> 2     , 3     , , 3     3.62  , 3.03  , New   , 0.59  

> 5     , 0     , , 4     3.02  , 3.03  , Cur   , 0.01  

> 5     , 4     , , 4     3.02  , 3.02  , Eq    , 0.0

> 3     , 0     , , 4     3.63  , 3.02  , New   , 0.61  

> 3     , 4     , , 4     3.63  , 3.04  , New   , 0.59  

> 6     , 0     , , 5     3.05  , 3.04  , New   , 0.01  

> 6     , 5     , , 5     3.02  , 3.02  , Eq    , 0.0

> 4     , 0     , , 5     3.63  , 3.02  , New   , 0.61  

> 4     , 5     , , 5     3.64  , 3.03  , New   , 0.61  

> 7     , 0     , , 6     3.03  , 3.03  , Eq    , 0.0

> 7     , 6     , , 6     3.02  , 3.02  , Eq    , 0.0

> 5     , 0     , , 6     3.64  , 3.01  , New   , 0.63  

> 5     , 6     , , 6     3.64  , 3.03  , New   , 0.61  

> 8     , 0     , , 7     3.03  , 3.04  , Cur   , 0.01  

> 8     , 7     , , 7     3.04  , 3.04  , Eq    , 0.0

> 6     , 0     , , 7     3.67  , 3.04  , New   , 0.63  

> 6     , 7     , , 7     3.65  , 3.05  , New   , 0.6   

> 9     , 0     , , 8     3.05  , 3.05  , Eq    , 0.0

> 7     , 0     , , 8     3.67  , 3.05  , New   , 0.62  

> 10    , 0     , , 9     3.06  , 3.06  , Eq    , 0.0

> 10    , 1     , , 9     3.06  , 3.06  , Eq    , 0.0

> 8     , 0     , , 9     3.67  , 3.06  , New   , 0.61  

> 8     , 1     , , 9     3.67  , 3.06  , New   , 0.61  

> 11    , 0     , , 10    3.06  , 3.06  , Eq    , 0.0

> 11    , 2     , , 10    3.07  , 3.06  , New   , 0.01  

> 9     , 0     , , 10    3.67  , 3.05  , New   , 0.62  

> 9     , 2     , , 10    3.67  , 3.06  , New   , 0.61  

> 12    , 0     , , 11    3.06  , 3.06  , Eq    , 0.0

> 12    , 3     , , 11    3.06  , 3.06  , Eq    , 0.0

> 10    , 0     , , 11    3.67  , 3.06  , New   , 0.61  

> 10    , 3     , , 11    3.67  , 3.06  , New   , 0.61  

> 13    , 0     , , 12    3.06  , 3.07  , Cur   , 0.01  

> 13    , 4     , , 12    3.06  , 3.07  , Cur   , 0.01  

> 11    , 0     , , 12    3.67  , 3.11  , New   , 0.56  

> 11    , 4     , , 12    3.68  , 3.12  , New   , 0.56  

> 14    , 0     , , 13    3.07  , 3.1   , Cur   , 0.03  

> 14    , 5     , , 13    3.06  , 3.07  , Cur   , 0.01  

> 12    , 0     , , 13    3.67  , 3.07  , New   , 0.6   

> 12    , 5     , , 13    3.67  , 3.08  , New   , 0.59  

> 15    , 0     , , 14    3.06  , 3.06  , Eq    , 0.0

> 15    , 6     , , 14    3.07  , 3.06  , New   , 0.01  

> 13    , 0     , , 14    3.67  , 3.06  , New   , 0.61  

> 13    , 6     , , 14    3.68  , 3.06  , New   , 0.62  

> 16    , 0     , , 15    3.06  , 3.06  , Eq    , 0.0

> 16    , 7     , , 15    3.06  , 3.05  , New   , 0.01  

> 14    , 0     , , 15    3.68  , 3.06  , New   , 0.62  

> 14    , 7     , , 15    3.67  , 3.06  , New   , 0.61  

> 17    , 0     , , 16    3.07  , 3.06  , New   , 0.01  

> 15    , 0     , , 16    3.68  , 3.06  , New   , 0.62  

> 18    , 0     , , 17    3.06  , 3.06  , Eq    , 0.0

> 18    , 1     , , 17    3.06  , 3.06  , Eq    , 0.0

> 16    , 0     , , 17    3.67  , 3.06  , New   , 0.61  

> 16    , 1     , , 17    3.67  , 3.05  , New   , 0.62  

> 19    , 0     , , 18    3.07  , 3.06  , New   , 0.01  

> 19    , 2     , , 18    3.06  , 3.06  , Eq    , 0.0

> 17    , 0     , , 18    3.68  , 3.08  , New   , 0.6   

> 17    , 2     , , 18    3.68  , 3.06  , New   , 0.62  

> 20    , 0     , , 19    3.06  , 3.06  , Eq    , 0.0

> 20    , 3     , , 19    3.06  , 3.06  , Eq    , 0.0

> 18    , 0     , , 19    3.68  , 3.06  , New   , 0.62  

> 18    , 3     , , 19    3.68  , 3.06  , New   , 0.62  

> 21    , 0     , , 20    3.06  , 3.06  , Eq    , 0.0

> 21    , 4     , , 20    3.06  , 3.06  , Eq    , 0.0

> 19    , 0     , , 20    3.67  , 3.06  , New   , 0.61  

> 19    , 4     , , 20    3.67  , 3.06  , New   , 0.61  

> 22    , 0     , , 21    3.06  , 3.06  , Eq    , 0.0

> 22    , 5     , , 21    3.06  , 3.06  , Eq    , 0.0

> 20    , 0     , , 21    3.67  , 3.05  , New   , 0.62  

> 20    , 5     , , 21    3.68  , 3.06  , New   , 0.62  

> 23    , 0     , , 22    3.07  , 3.06  , New   , 0.01  

> 23    , 6     , , 22    3.06  , 3.06  , Eq    , 0.0

> 21    , 0     , , 22    3.68  , 3.07  , New   , 0.61  

> 21    , 6     , , 22    3.67  , 3.06  , New   , 0.61  

> 24    , 0     , , 23    3.19  , 3.06  , New   , 0.13  

> 24    , 7     , , 23    3.08  , 3.06  , New   , 0.02  

> 22    , 0     , , 23    3.69  , 3.06  , New   , 0.63  

> 22    , 7     , , 23    3.68  , 3.06  , New   , 0.62  

> 25    , 0     , , 24    3.07  , 3.06  , New   , 0.01  

> 23    , 0     , , 24    3.68  , 3.06  , New   , 0.62  

> 26    , 0     , , 25    3.06  , 3.05  , New   , 0.01  

> 26    , 1     , , 25    3.07  , 3.06  , New   , 0.01  

> 24    , 0     , , 25    3.67  , 3.05  , New   , 0.62  

> 24    , 1     , , 25    3.68  , 3.06  , New   , 0.62  

> 27    , 0     , , 26    3.12  , 3.06  , New   , 0.06  

> 27    , 2     , , 26    3.08  , 3.06  , New   , 0.02  

> 25    , 0     , , 26    3.69  , 3.06  , New   , 0.63  

> 25    , 2     , , 26    3.67  , 3.06  , New   , 0.61  

> 28    , 0     , , 27    3.06  , 3.06  , Eq    , 0.0

> 28    , 3     , , 27    3.06  , 3.06  , Eq    , 0.0

> 26    , 0     , , 27    3.67  , 3.06  , New   , 0.61  

> 26    , 3     , , 27    3.67  , 3.06  , New   , 0.61  

> 29    , 0     , , 28    3.06  , 3.06  , Eq    , 0.0

> 29    , 4     , , 28    3.06  , 3.06  , Eq    , 0.0

> 27    , 0     , , 28    3.68  , 3.05  , New   , 0.63  

> 27    , 4     , , 28    3.67  , 3.06  , New   , 0.61  

> 30    , 0     , , 29    3.06  , 3.06  , Eq    , 0.0

> 30    , 5     , , 29    3.06  , 3.06  , Eq    , 0.0

> 28    , 0     , , 29    3.67  , 3.06  , New   , 0.61  

> 28    , 5     , , 29    3.68  , 3.06  , New   , 0.62  

> 31    , 0     , , 30    3.06  , 3.06  , Eq    , 0.0

> 31    , 6     , , 30    3.06  , 3.06  , Eq    , 0.0

> 29    , 0     , , 30    3.68  , 3.06  , New   , 0.62  

> 29    , 6     , , 30    3.7   , 3.06  , New   , 0.64  

> 32    , 0     , , 31    3.17  , 3.06  , New   , 0.11  

> 32    , 7     , , 31    3.12  , 3.06  , New   , 0.06  

> 30    , 0     , , 31    3.68  , 3.06  , New   , 0.62  

> 30    , 7     , , 31    3.68  , 3.06  , New   , 0.62

> 

> Results For Icelake memchr-evex

> size  , algn  , Pos   , Cur T , New T , Win   , Dif   

> 2048  , 0     , , 32    4.94  , 4.26  , New   , 0.68  

> 256   , 1     , , 64    4.5   , 4.13  , New   , 0.37  

> 2048  , 0     , , 64    4.19  , 3.9   , New   , 0.29  

> 256   , 2     , , 64    4.19  , 3.87  , New   , 0.32  

> 2048  , 0     , , 128   4.96  , 4.53  , New   , 0.43  

> 256   , 3     , , 64    4.07  , 3.86  , New   , 0.21  

> 2048  , 0     , , 256   8.77  , 8.61  , New   , 0.16  

> 256   , 4     , , 64    4.08  , 3.87  , New   , 0.21  

> 2048  , 0     , , 512   12.22 , 11.67 , New   , 0.55  

> 256   , 5     , , 64    4.12  , 3.83  , New   , 0.29  

> 2048  , 0     , , 1024  20.06 , 18.09 , New   , 1.97  

> 256   , 6     , , 64    4.2   , 3.95  , New   , 0.25  

> 2048  , 0     , , 2048  33.83 , 30.62 , New   , 3.21  

> 256   , 7     , , 64    4.3   , 4.04  , New   , 0.26  

> 192   , 1     , , 32    4.2   , 3.71  , New   , 0.49  

> 256   , 1     , , 32    4.24  , 3.76  , New   , 0.48  

> 512   , 1     , , 32    4.29  , 3.74  , New   , 0.55  

> 192   , 2     , , 64    4.42  , 4.0   , New   , 0.42  

> 512   , 2     , , 64    4.17  , 3.83  , New   , 0.34  

> 192   , 3     , , 96    4.44  , 4.26  , New   , 0.18  

> 256   , 3     , , 96    4.45  , 4.14  , New   , 0.31  

> 512   , 3     , , 96    4.42  , 4.15  , New   , 0.27  

> 192   , 4     , , 128   4.93  , 4.45  , New   , 0.48  

> 256   , 4     , , 128   4.93  , 4.47  , New   , 0.46  

> 512   , 4     , , 128   4.95  , 4.47  , New   , 0.48  

> 192   , 5     , , 160   5.95  , 5.44  , New   , 0.51  

> 256   , 5     , , 160   5.59  , 5.47  , New   , 0.12  

> 512   , 5     , , 160   7.59  , 7.34  , New   , 0.25  

> 192   , 6     , , 192   6.53  , 6.08  , New   , 0.45  

> 256   , 6     , , 192   6.2   , 5.88  , New   , 0.32  

> 512   , 6     , , 192   7.53  , 7.62  , Cur   , 0.09  

> 192   , 7     , , 224   6.62  , 6.12  , New   , 0.5   

> 256   , 7     , , 224   6.79  , 6.51  , New   , 0.28  

> 512   , 7     , , 224   8.12  , 7.61  , New   , 0.51  

> 2     , 0     , , 1     2.5   , 2.54  , Cur   , 0.04  

> 2     , 1     , , 1     2.56  , 2.55  , New   , 0.01  

> 0     , 0     , , 1     2.57  , 3.12  , Cur   , 0.55  

> 0     , 1     , , 1     2.59  , 3.14  , Cur   , 0.55  

> 3     , 0     , , 2     2.62  , 2.63  , Cur   , 0.01  

> 3     , 2     , , 2     2.66  , 2.67  , Cur   , 0.01  

> 1     , 0     , , 2     3.24  , 2.72  , New   , 0.52  

> 1     , 2     , , 2     3.28  , 2.75  , New   , 0.53  

> 4     , 0     , , 3     2.78  , 2.8   , Cur   , 0.02  

> 4     , 3     , , 3     2.8   , 2.82  , Cur   , 0.02  

> 2     , 0     , , 3     3.38  , 2.86  , New   , 0.52  

> 2     , 3     , , 3     3.41  , 2.89  , New   , 0.52  

> 5     , 0     , , 4     2.88  , 2.91  , Cur   , 0.03  

> 5     , 4     , , 4     2.88  , 2.92  , Cur   , 0.04  

> 3     , 0     , , 4     3.48  , 2.93  , New   , 0.55  

> 3     , 4     , , 4     3.47  , 2.93  , New   , 0.54  

> 6     , 0     , , 5     2.95  , 2.94  , New   , 0.01  

> 6     , 5     , , 5     2.91  , 2.92  , Cur   , 0.01  

> 4     , 0     , , 5     3.47  , 2.9   , New   , 0.57  

> 4     , 5     , , 5     3.43  , 2.91  , New   , 0.52  

> 7     , 0     , , 6     2.87  , 2.9   , Cur   , 0.03  

> 7     , 6     , , 6     2.87  , 2.89  , Cur   , 0.02  

> 5     , 0     , , 6     3.44  , 2.88  , New   , 0.56  

> 5     , 6     , , 6     3.41  , 2.87  , New   , 0.54  

> 8     , 0     , , 7     2.86  , 2.87  , Cur   , 0.01  

> 8     , 7     , , 7     2.86  , 2.87  , Cur   , 0.01  

> 6     , 0     , , 7     3.43  , 2.87  , New   , 0.56  

> 6     , 7     , , 7     3.44  , 2.87  , New   , 0.57  

> 9     , 0     , , 8     2.86  , 2.88  , Cur   , 0.02  

> 7     , 0     , , 8     3.41  , 2.89  , New   , 0.52  

> 10    , 0     , , 9     2.83  , 2.87  , Cur   , 0.04  

> 10    , 1     , , 9     2.82  , 2.87  , Cur   , 0.05  

> 8     , 0     , , 9     3.4   , 2.89  , New   , 0.51  

> 8     , 1     , , 9     3.41  , 2.87  , New   , 0.54  

> 11    , 0     , , 10    2.83  , 2.88  , Cur   , 0.05  

> 11    , 2     , , 10    2.84  , 2.88  , Cur   , 0.04  

> 9     , 0     , , 10    3.41  , 2.87  , New   , 0.54  

> 9     , 2     , , 10    3.41  , 2.88  , New   , 0.53  

> 12    , 0     , , 11    2.83  , 2.89  , Cur   , 0.06  

> 12    , 3     , , 11    2.85  , 2.87  , Cur   , 0.02  

> 10    , 0     , , 11    3.41  , 2.87  , New   , 0.54  

> 10    , 3     , , 11    3.42  , 2.88  , New   , 0.54  

> 13    , 0     , , 12    2.86  , 2.87  , Cur   , 0.01  

> 13    , 4     , , 12    2.84  , 2.88  , Cur   , 0.04  

> 11    , 0     , , 12    3.43  , 2.87  , New   , 0.56  

> 11    , 4     , , 12    3.49  , 2.87  , New   , 0.62  

> 14    , 0     , , 13    2.85  , 2.86  , Cur   , 0.01  

> 14    , 5     , , 13    2.85  , 2.86  , Cur   , 0.01  

> 12    , 0     , , 13    3.41  , 2.86  , New   , 0.55  

> 12    , 5     , , 13    3.44  , 2.85  , New   , 0.59  

> 15    , 0     , , 14    2.83  , 2.87  , Cur   , 0.04  

> 15    , 6     , , 14    2.82  , 2.86  , Cur   , 0.04  

> 13    , 0     , , 14    3.41  , 2.86  , New   , 0.55  

> 13    , 6     , , 14    3.4   , 2.86  , New   , 0.54  

> 16    , 0     , , 15    2.84  , 2.86  , Cur   , 0.02  

> 16    , 7     , , 15    2.83  , 2.85  , Cur   , 0.02  

> 14    , 0     , , 15    3.41  , 2.85  , New   , 0.56  

> 14    , 7     , , 15    3.39  , 2.87  , New   , 0.52  

> 17    , 0     , , 16    2.83  , 2.87  , Cur   , 0.04  

> 15    , 0     , , 16    3.4   , 2.85  , New   , 0.55  

> 18    , 0     , , 17    2.83  , 2.86  , Cur   , 0.03  

> 18    , 1     , , 17    2.85  , 2.84  , New   , 0.01  

> 16    , 0     , , 17    3.41  , 2.85  , New   , 0.56  

> 16    , 1     , , 17    3.4   , 2.86  , New   , 0.54  

> 19    , 0     , , 18    2.8   , 2.84  , Cur   , 0.04  

> 19    , 2     , , 18    2.82  , 2.83  , Cur   , 0.01  

> 17    , 0     , , 18    3.39  , 2.86  , New   , 0.53  

> 17    , 2     , , 18    3.39  , 2.84  , New   , 0.55  

> 20    , 0     , , 19    2.85  , 2.87  , Cur   , 0.02  

> 20    , 3     , , 19    2.88  , 2.87  , New   , 0.01  

> 18    , 0     , , 19    3.38  , 2.85  , New   , 0.53  

> 18    , 3     , , 19    3.4   , 2.85  , New   , 0.55  

> 21    , 0     , , 20    2.83  , 2.85  , Cur   , 0.02  

> 21    , 4     , , 20    2.88  , 2.85  , New   , 0.03  

> 19    , 0     , , 20    3.39  , 2.84  , New   , 0.55  

> 19    , 4     , , 20    3.39  , 2.96  , New   , 0.43  

> 22    , 0     , , 21    2.84  , 2.9   , Cur   , 0.06  

> 22    , 5     , , 21    2.81  , 2.84  , Cur   , 0.03  

> 20    , 0     , , 21    3.41  , 2.81  , New   , 0.6   

> 20    , 5     , , 21    3.38  , 2.83  , New   , 0.55  

> 23    , 0     , , 22    2.8   , 2.82  , Cur   , 0.02  

> 23    , 6     , , 22    2.81  , 2.83  , Cur   , 0.02  

> 21    , 0     , , 22    3.35  , 2.81  , New   , 0.54  

> 21    , 6     , , 22    3.34  , 2.81  , New   , 0.53  

> 24    , 0     , , 23    2.77  , 2.84  , Cur   , 0.07  

> 24    , 7     , , 23    2.78  , 2.8   , Cur   , 0.02  

> 22    , 0     , , 23    3.34  , 2.79  , New   , 0.55  

> 22    , 7     , , 23    3.32  , 2.79  , New   , 0.53  

> 25    , 0     , , 24    2.77  , 2.8   , Cur   , 0.03  

> 23    , 0     , , 24    3.29  , 2.79  , New   , 0.5   

> 26    , 0     , , 25    2.73  , 2.78  , Cur   , 0.05  

> 26    , 1     , , 25    2.75  , 2.79  , Cur   , 0.04  

> 24    , 0     , , 25    3.27  , 2.79  , New   , 0.48  

> 24    , 1     , , 25    3.27  , 2.77  , New   , 0.5   

> 27    , 0     , , 26    2.72  , 2.78  , Cur   , 0.06  

> 27    , 2     , , 26    2.75  , 2.76  , Cur   , 0.01  

> 25    , 0     , , 26    3.29  , 2.73  , New   , 0.56  

> 25    , 2     , , 26    3.3   , 2.76  , New   , 0.54  

> 28    , 0     , , 27    2.75  , 2.79  , Cur   , 0.04  

> 28    , 3     , , 27    2.77  , 2.77  , Eq    , 0.0

> 26    , 0     , , 27    3.28  , 2.78  , New   , 0.5   

> 26    , 3     , , 27    3.29  , 2.78  , New   , 0.51  

> 29    , 0     , , 28    2.74  , 2.76  , Cur   , 0.02  

> 29    , 4     , , 28    2.74  , 2.77  , Cur   , 0.03  

> 27    , 0     , , 28    3.3   , 2.76  , New   , 0.54  

> 27    , 4     , , 28    3.3   , 2.74  , New   , 0.56  

> 30    , 0     , , 29    2.72  , 2.76  , Cur   , 0.04  

> 30    , 5     , , 29    2.74  , 2.75  , Cur   , 0.01  

> 28    , 0     , , 29    3.25  , 2.73  , New   , 0.52  

> 28    , 5     , , 29    3.3   , 2.73  , New   , 0.57  

> 31    , 0     , , 30    2.73  , 2.77  , Cur   , 0.04  

> 31    , 6     , , 30    2.74  , 2.76  , Cur   , 0.02  

> 29    , 0     , , 30    3.25  , 2.73  , New   , 0.52  

> 29    , 6     , , 30    3.26  , 2.74  , New   , 0.52  

> 32    , 0     , , 31    2.73  , 2.74  , Cur   , 0.01  

> 32    , 7     , , 31    2.73  , 2.75  , Cur   , 0.02  

> 30    , 0     , , 31    3.24  , 2.72  , New   , 0.52  

> 30    , 7     , , 31    3.24  , 2.72  , New   , 0.52

> 

> For memchr-avx2 the improvements are more modest though again near

> universal. The improvement is most significant for medium sizes and

> small sizes with pos > size. For small sizes with pos < size and large

> sizes the two implementations perform roughly the same.

> 

> Results For Tigerlake memchr-avx2

> size  , algn  , Pos   , Cur T , New T , Win   , Dif   

> 2048  , 0     , , 32    6.15  , 6.27  , Cur   , 0.12  

> 256   , 1     , , 64    6.21  , 6.03  , New   , 0.18  

> 2048  , 0     , , 64    6.07  , 5.95  , New   , 0.12  

> 256   , 2     , , 64    6.01  , 5.8   , New   , 0.21  

> 2048  , 0     , , 128   7.05  , 6.55  , New   , 0.5   

> 256   , 3     , , 64    6.14  , 5.83  , New   , 0.31  

> 2048  , 0     , , 256   11.78 , 11.78 , Eq    , 0.0

> 256   , 4     , , 64    6.1   , 5.85  , New   , 0.25  

> 2048  , 0     , , 512   16.32 , 15.96 , New   , 0.36  

> 256   , 5     , , 64    6.1   , 5.77  , New   , 0.33  

> 2048  , 0     , , 1024  25.38 , 25.18 , New   , 0.2   

> 256   , 6     , , 64    6.08  , 5.88  , New   , 0.2   

> 2048  , 0     , , 2048  38.56 , 38.32 , New   , 0.24  

> 256   , 7     , , 64    5.93  , 5.68  , New   , 0.25  

> 192   , 1     , , 32    5.49  , 5.3   , New   , 0.19  

> 256   , 1     , , 32    5.5   , 5.28  , New   , 0.22  

> 512   , 1     , , 32    5.48  , 5.32  , New   , 0.16  

> 192   , 2     , , 64    6.1   , 5.73  , New   , 0.37  

> 512   , 2     , , 64    5.88  , 5.72  , New   , 0.16  

> 192   , 3     , , 96    6.31  , 5.93  , New   , 0.38  

> 256   , 3     , , 96    6.32  , 5.93  , New   , 0.39  

> 512   , 3     , , 96    6.2   , 5.94  , New   , 0.26  

> 192   , 4     , , 128   6.65  , 6.4   , New   , 0.25  

> 256   , 4     , , 128   6.6   , 6.37  , New   , 0.23  

> 512   , 4     , , 128   6.74  , 6.33  , New   , 0.41  

> 192   , 5     , , 160   7.78  , 7.4   , New   , 0.38  

> 256   , 5     , , 160   7.18  , 7.4   , Cur   , 0.22  

> 512   , 5     , , 160   9.81  , 9.44  , New   , 0.37  

> 192   , 6     , , 192   9.12  , 7.77  , New   , 1.35  

> 256   , 6     , , 192   7.97  , 7.66  , New   , 0.31  

> 512   , 6     , , 192   10.14 , 9.95  , New   , 0.19  

> 192   , 7     , , 224   8.96  , 7.78  , New   , 1.18  

> 256   , 7     , , 224   8.52  , 8.23  , New   , 0.29  

> 512   , 7     , , 224   10.33 , 9.98  , New   , 0.35  

> 2     , 0     , , 1     3.61  , 3.6   , New   , 0.01  

> 2     , 1     , , 1     3.6   , 3.6   , Eq    , 0.0

> 0     , 0     , , 1     3.02  , 3.0   , New   , 0.02  

> 0     , 1     , , 1     3.0   , 3.0   , Eq    , 0.0

> 3     , 0     , , 2     3.6   , 3.6   , Eq    , 0.0

> 3     , 2     , , 2     3.61  , 3.6   , New   , 0.01  

> 1     , 0     , , 2     4.82  , 3.6   , New   , 1.22  

> 1     , 2     , , 2     4.81  , 3.6   , New   , 1.21  

> 4     , 0     , , 3     3.61  , 3.61  , Eq    , 0.0

> 4     , 3     , , 3     3.62  , 3.61  , New   , 0.01  

> 2     , 0     , , 3     4.82  , 3.62  , New   , 1.2   

> 2     , 3     , , 3     4.83  , 3.63  , New   , 1.2   

> 5     , 0     , , 4     3.63  , 3.64  , Cur   , 0.01  

> 5     , 4     , , 4     3.63  , 3.62  , New   , 0.01  

> 3     , 0     , , 4     4.84  , 3.62  , New   , 1.22  

> 3     , 4     , , 4     4.84  , 3.64  , New   , 1.2   

> 6     , 0     , , 5     3.66  , 3.64  , New   , 0.02  

> 6     , 5     , , 5     3.65  , 3.62  , New   , 0.03  

> 4     , 0     , , 5     4.83  , 3.63  , New   , 1.2   

> 4     , 5     , , 5     4.85  , 3.64  , New   , 1.21  

> 7     , 0     , , 6     3.76  , 3.79  , Cur   , 0.03  

> 7     , 6     , , 6     3.76  , 3.72  , New   , 0.04  

> 5     , 0     , , 6     4.84  , 3.62  , New   , 1.22  

> 5     , 6     , , 6     4.85  , 3.64  , New   , 1.21  

> 8     , 0     , , 7     3.64  , 3.65  , Cur   , 0.01  

> 8     , 7     , , 7     3.65  , 3.65  , Eq    , 0.0

> 6     , 0     , , 7     4.88  , 3.64  , New   , 1.24  

> 6     , 7     , , 7     4.87  , 3.65  , New   , 1.22  

> 9     , 0     , , 8     3.66  , 3.66  , Eq    , 0.0

> 7     , 0     , , 8     4.89  , 3.66  , New   , 1.23  

> 10    , 0     , , 9     3.67  , 3.67  , Eq    , 0.0

> 10    , 1     , , 9     3.67  , 3.67  , Eq    , 0.0

> 8     , 0     , , 9     4.9   , 3.67  , New   , 1.23  

> 8     , 1     , , 9     4.9   , 3.67  , New   , 1.23  

> 11    , 0     , , 10    3.68  , 3.67  , New   , 0.01  

> 11    , 2     , , 10    3.69  , 3.67  , New   , 0.02  

> 9     , 0     , , 10    4.9   , 3.67  , New   , 1.23  

> 9     , 2     , , 10    4.9   , 3.67  , New   , 1.23  

> 12    , 0     , , 11    3.71  , 3.68  , New   , 0.03  

> 12    , 3     , , 11    3.71  , 3.67  , New   , 0.04  

> 10    , 0     , , 11    4.9   , 3.67  , New   , 1.23  

> 10    , 3     , , 11    4.9   , 3.67  , New   , 1.23  

> 13    , 0     , , 12    4.24  , 4.23  , New   , 0.01  

> 13    , 4     , , 12    4.23  , 4.23  , Eq    , 0.0

> 11    , 0     , , 12    4.9   , 3.7   , New   , 1.2   

> 11    , 4     , , 12    4.9   , 3.73  , New   , 1.17  

> 14    , 0     , , 13    3.99  , 4.01  , Cur   , 0.02  

> 14    , 5     , , 13    3.98  , 3.98  , Eq    , 0.0

> 12    , 0     , , 13    4.9   , 3.69  , New   , 1.21  

> 12    , 5     , , 13    4.9   , 3.69  , New   , 1.21  

> 15    , 0     , , 14    3.99  , 3.97  , New   , 0.02  

> 15    , 6     , , 14    4.0   , 4.0   , Eq    , 0.0

> 13    , 0     , , 14    4.9   , 3.67  , New   , 1.23  

> 13    , 6     , , 14    4.9   , 3.67  , New   , 1.23  

> 16    , 0     , , 15    3.99  , 4.02  , Cur   , 0.03  

> 16    , 7     , , 15    4.01  , 3.96  , New   , 0.05  

> 14    , 0     , , 15    4.93  , 3.67  , New   , 1.26  

> 14    , 7     , , 15    4.92  , 3.67  , New   , 1.25  

> 17    , 0     , , 16    4.04  , 3.99  , New   , 0.05  

> 15    , 0     , , 16    5.42  , 4.22  , New   , 1.2   

> 18    , 0     , , 17    4.01  , 3.97  , New   , 0.04  

> 18    , 1     , , 17    3.99  , 3.98  , New   , 0.01  

> 16    , 0     , , 17    5.22  , 3.98  , New   , 1.24  

> 16    , 1     , , 17    5.19  , 3.98  , New   , 1.21  

> 19    , 0     , , 18    4.0   , 3.99  , New   , 0.01  

> 19    , 2     , , 18    4.03  , 3.97  , New   , 0.06  

> 17    , 0     , , 18    5.18  , 3.99  , New   , 1.19  

> 17    , 2     , , 18    5.18  , 3.98  , New   , 1.2   

> 20    , 0     , , 19    4.02  , 3.98  , New   , 0.04  

> 20    , 3     , , 19    4.0   , 3.98  , New   , 0.02  

> 18    , 0     , , 19    5.19  , 3.97  , New   , 1.22  

> 18    , 3     , , 19    5.21  , 3.98  , New   , 1.23  

> 21    , 0     , , 20    3.98  , 4.0   , Cur   , 0.02  

> 21    , 4     , , 20    4.0   , 4.0   , Eq    , 0.0

> 19    , 0     , , 20    5.19  , 3.99  , New   , 1.2   

> 19    , 4     , , 20    5.17  , 3.99  , New   , 1.18  

> 22    , 0     , , 21    4.03  , 3.98  , New   , 0.05  

> 22    , 5     , , 21    4.01  , 3.95  , New   , 0.06  

> 20    , 0     , , 21    5.19  , 4.0   , New   , 1.19  

> 20    , 5     , , 21    5.21  , 3.99  , New   , 1.22  

> 23    , 0     , , 22    4.06  , 3.97  , New   , 0.09  

> 23    , 6     , , 22    4.02  , 3.98  , New   , 0.04  

> 21    , 0     , , 22    5.2   , 4.02  , New   , 1.18  

> 21    , 6     , , 22    5.22  , 4.0   , New   , 1.22  

> 24    , 0     , , 23    4.15  , 3.98  , New   , 0.17  

> 24    , 7     , , 23    4.0   , 4.01  , Cur   , 0.01  

> 22    , 0     , , 23    5.28  , 4.0   , New   , 1.28  

> 22    , 7     , , 23    5.22  , 3.99  , New   , 1.23  

> 25    , 0     , , 24    4.1   , 4.04  , New   , 0.06  

> 23    , 0     , , 24    5.23  , 4.04  , New   , 1.19  

> 26    , 0     , , 25    4.1   , 4.06  , New   , 0.04  

> 26    , 1     , , 25    4.07  , 3.99  , New   , 0.08  

> 24    , 0     , , 25    5.26  , 4.02  , New   , 1.24  

> 24    , 1     , , 25    5.21  , 4.0   , New   , 1.21  

> 27    , 0     , , 26    4.17  , 4.03  , New   , 0.14  

> 27    , 2     , , 26    4.09  , 4.03  , New   , 0.06  

> 25    , 0     , , 26    5.29  , 4.1   , New   , 1.19  

> 25    , 2     , , 26    5.25  , 4.0   , New   , 1.25  

> 28    , 0     , , 27    4.06  , 4.1   , Cur   , 0.04  

> 28    , 3     , , 27    4.09  , 4.04  , New   , 0.05  

> 26    , 0     , , 27    5.26  , 4.04  , New   , 1.22  

> 26    , 3     , , 27    5.28  , 4.01  , New   , 1.27  

> 29    , 0     , , 28    4.07  , 4.02  , New   , 0.05  

> 29    , 4     , , 28    4.07  , 4.05  , New   , 0.02  

> 27    , 0     , , 28    5.25  , 4.02  , New   , 1.23  

> 27    , 4     , , 28    5.25  , 4.03  , New   , 1.22  

> 30    , 0     , , 29    4.14  , 4.06  , New   , 0.08  

> 30    , 5     , , 29    4.08  , 4.04  , New   , 0.04  

> 28    , 0     , , 29    5.26  , 4.07  , New   , 1.19  

> 28    , 5     , , 29    5.28  , 4.04  , New   , 1.24  

> 31    , 0     , , 30    4.09  , 4.08  , New   , 0.01  

> 31    , 6     , , 30    4.1   , 4.08  , New   , 0.02  

> 29    , 0     , , 30    5.28  , 4.05  , New   , 1.23  

> 29    , 6     , , 30    5.24  , 4.07  , New   , 1.17  

> 32    , 0     , , 31    4.1   , 4.13  , Cur   , 0.03  

> 32    , 7     , , 31    4.16  , 4.09  , New   , 0.07  

> 30    , 0     , , 31    5.31  , 4.09  , New   , 1.22  

> 30    , 7     , , 31    5.28  , 4.08  , New   , 1.2

> 

> Results For Icelake memchr-avx2

> size  , algn  , Pos   , Cur T , New T , Win   , Dif   

> 2048  , 0     , , 32    5.74  , 5.08  , New   , 0.66  

> 256   , 1     , , 64    5.16  , 4.93  , New   , 0.23  

> 2048  , 0     , , 64    4.86  , 4.69  , New   , 0.17  

> 256   , 2     , , 64    4.78  , 4.7   , New   , 0.08  

> 2048  , 0     , , 128   5.64  , 5.0   , New   , 0.64  

> 256   , 3     , , 64    4.64  , 4.59  , New   , 0.05  

> 2048  , 0     , , 256   9.07  , 9.17  , Cur   , 0.1   

> 256   , 4     , , 64    4.7   , 4.6   , New   , 0.1   

> 2048  , 0     , , 512   12.56 , 12.33 , New   , 0.23  

> 256   , 5     , , 64    4.72  , 4.61  , New   , 0.11  

> 2048  , 0     , , 1024  19.36 , 19.49 , Cur   , 0.13  

> 256   , 6     , , 64    4.82  , 4.69  , New   , 0.13  

> 2048  , 0     , , 2048  29.99 , 30.53 , Cur   , 0.54  

> 256   , 7     , , 64    4.9   , 4.85  , New   , 0.05  

> 192   , 1     , , 32    4.89  , 4.45  , New   , 0.44  

> 256   , 1     , , 32    4.93  , 4.44  , New   , 0.49  

> 512   , 1     , , 32    4.97  , 4.45  , New   , 0.52  

> 192   , 2     , , 64    5.04  , 4.65  , New   , 0.39  

> 512   , 2     , , 64    4.75  , 4.66  , New   , 0.09  

> 192   , 3     , , 96    5.14  , 4.66  , New   , 0.48  

> 256   , 3     , , 96    5.12  , 4.66  , New   , 0.46  

> 512   , 3     , , 96    5.13  , 4.62  , New   , 0.51  

> 192   , 4     , , 128   5.65  , 4.95  , New   , 0.7   

> 256   , 4     , , 128   5.63  , 4.95  , New   , 0.68  

> 512   , 4     , , 128   5.68  , 4.96  , New   , 0.72  

> 192   , 5     , , 160   6.1   , 5.84  , New   , 0.26  

> 256   , 5     , , 160   5.58  , 5.84  , Cur   , 0.26  

> 512   , 5     , , 160   7.95  , 7.74  , New   , 0.21  

> 192   , 6     , , 192   7.07  , 6.23  , New   , 0.84  

> 256   , 6     , , 192   6.34  , 6.09  , New   , 0.25  

> 512   , 6     , , 192   8.17  , 8.13  , New   , 0.04  

> 192   , 7     , , 224   7.06  , 6.23  , New   , 0.83  

> 256   , 7     , , 224   6.76  , 6.65  , New   , 0.11  

> 512   , 7     , , 224   8.29  , 8.08  , New   , 0.21  

> 2     , 0     , , 1     3.0   , 3.04  , Cur   , 0.04  

> 2     , 1     , , 1     3.06  , 3.07  , Cur   , 0.01  

> 0     , 0     , , 1     2.57  , 2.59  , Cur   , 0.02  

> 0     , 1     , , 1     2.6   , 2.61  , Cur   , 0.01  

> 3     , 0     , , 2     3.15  , 3.17  , Cur   , 0.02  

> 3     , 2     , , 2     3.19  , 3.21  , Cur   , 0.02  

> 1     , 0     , , 2     4.32  , 3.25  , New   , 1.07  

> 1     , 2     , , 2     4.36  , 3.31  , New   , 1.05  

> 4     , 0     , , 3     3.5   , 3.52  , Cur   , 0.02  

> 4     , 3     , , 3     3.52  , 3.54  , Cur   , 0.02  

> 2     , 0     , , 3     4.51  , 3.43  , New   , 1.08  

> 2     , 3     , , 3     4.56  , 3.47  , New   , 1.09  

> 5     , 0     , , 4     3.61  , 3.65  , Cur   , 0.04  

> 5     , 4     , , 4     3.63  , 3.67  , Cur   , 0.04  

> 3     , 0     , , 4     4.64  , 3.51  , New   , 1.13  

> 3     , 4     , , 4     4.7   , 3.51  , New   , 1.19  

> 6     , 0     , , 5     3.66  , 3.68  , Cur   , 0.02  

> 6     , 5     , , 5     3.69  , 3.65  , New   , 0.04  

> 4     , 0     , , 5     4.7   , 3.49  , New   , 1.21  

> 4     , 5     , , 5     4.58  , 3.48  , New   , 1.1   

> 7     , 0     , , 6     3.6   , 3.65  , Cur   , 0.05  

> 7     , 6     , , 6     3.59  , 3.64  , Cur   , 0.05  

> 5     , 0     , , 6     4.74  , 3.65  , New   , 1.09  

> 5     , 6     , , 6     4.73  , 3.64  , New   , 1.09  

> 8     , 0     , , 7     3.6   , 3.61  , Cur   , 0.01  

> 8     , 7     , , 7     3.6   , 3.61  , Cur   , 0.01  

> 6     , 0     , , 7     4.73  , 3.6   , New   , 1.13  

> 6     , 7     , , 7     4.73  , 3.62  , New   , 1.11  

> 9     , 0     , , 8     3.59  , 3.62  , Cur   , 0.03  

> 7     , 0     , , 8     4.72  , 3.64  , New   , 1.08  

> 10    , 0     , , 9     3.57  , 3.62  , Cur   , 0.05  

> 10    , 1     , , 9     3.56  , 3.61  , Cur   , 0.05  

> 8     , 0     , , 9     4.69  , 3.63  , New   , 1.06  

> 8     , 1     , , 9     4.71  , 3.61  , New   , 1.1   

> 11    , 0     , , 10    3.58  , 3.62  , Cur   , 0.04  

> 11    , 2     , , 10    3.59  , 3.63  , Cur   , 0.04  

> 9     , 0     , , 10    4.72  , 3.61  , New   , 1.11  

> 9     , 2     , , 10    4.7   , 3.61  , New   , 1.09  

> 12    , 0     , , 11    3.58  , 3.63  , Cur   , 0.05  

> 12    , 3     , , 11    3.58  , 3.62  , Cur   , 0.04  

> 10    , 0     , , 11    4.7   , 3.6   , New   , 1.1   

> 10    , 3     , , 11    4.73  , 3.64  , New   , 1.09  

> 13    , 0     , , 12    3.6   , 3.6   , Eq    , 0.0

> 13    , 4     , , 12    3.57  , 3.62  , Cur   , 0.05  

> 11    , 0     , , 12    4.73  , 3.62  , New   , 1.11  

> 11    , 4     , , 12    4.79  , 3.61  , New   , 1.18  

> 14    , 0     , , 13    3.61  , 3.62  , Cur   , 0.01  

> 14    , 5     , , 13    3.59  , 3.59  , Eq    , 0.0

> 12    , 0     , , 13    4.7   , 3.61  , New   , 1.09  

> 12    , 5     , , 13    4.75  , 3.58  , New   , 1.17  

> 15    , 0     , , 14    3.58  , 3.62  , Cur   , 0.04  

> 15    , 6     , , 14    3.59  , 3.62  , Cur   , 0.03  

> 13    , 0     , , 14    4.68  , 3.6   , New   , 1.08  

> 13    , 6     , , 14    4.68  , 3.63  , New   , 1.05  

> 16    , 0     , , 15    3.57  , 3.6   , Cur   , 0.03  

> 16    , 7     , , 15    3.55  , 3.59  , Cur   , 0.04  

> 14    , 0     , , 15    4.69  , 3.61  , New   , 1.08  

> 14    , 7     , , 15    4.69  , 3.61  , New   , 1.08  

> 17    , 0     , , 16    3.56  , 3.61  , Cur   , 0.05  

> 15    , 0     , , 16    4.71  , 3.58  , New   , 1.13  

> 18    , 0     , , 17    3.57  , 3.65  , Cur   , 0.08  

> 18    , 1     , , 17    3.58  , 3.59  , Cur   , 0.01  

> 16    , 0     , , 17    4.7   , 3.58  , New   , 1.12  

> 16    , 1     , , 17    4.68  , 3.59  , New   , 1.09  

> 19    , 0     , , 18    3.51  , 3.58  , Cur   , 0.07  

> 19    , 2     , , 18    3.55  , 3.58  , Cur   , 0.03  

> 17    , 0     , , 18    4.69  , 3.61  , New   , 1.08  

> 17    , 2     , , 18    4.68  , 3.61  , New   , 1.07  

> 20    , 0     , , 19    3.57  , 3.6   , Cur   , 0.03  

> 20    , 3     , , 19    3.59  , 3.59  , Eq    , 0.0

> 18    , 0     , , 19    4.68  , 3.59  , New   , 1.09  

> 18    , 3     , , 19    4.67  , 3.57  , New   , 1.1   

> 21    , 0     , , 20    3.61  , 3.58  , New   , 0.03  

> 21    , 4     , , 20    3.62  , 3.6   , New   , 0.02  

> 19    , 0     , , 20    4.74  , 3.57  , New   , 1.17  

> 19    , 4     , , 20    4.69  , 3.7   , New   , 0.99  

> 22    , 0     , , 21    3.57  , 3.64  , Cur   , 0.07  

> 22    , 5     , , 21    3.55  , 3.6   , Cur   , 0.05  

> 20    , 0     , , 21    4.72  , 3.55  , New   , 1.17  

> 20    , 5     , , 21    4.66  , 3.55  , New   , 1.11  

> 23    , 0     , , 22    3.56  , 3.56  , Eq    , 0.0

> 23    , 6     , , 22    3.54  , 3.56  , Cur   , 0.02  

> 21    , 0     , , 22    4.65  , 3.53  , New   , 1.12  

> 21    , 6     , , 22    4.62  , 3.56  , New   , 1.06  

> 24    , 0     , , 23    3.5   , 3.54  , Cur   , 0.04  

> 24    , 7     , , 23    3.52  , 3.53  , Cur   , 0.01  

> 22    , 0     , , 23    4.61  , 3.51  , New   , 1.1   

> 22    , 7     , , 23    4.6   , 3.51  , New   , 1.09  

> 25    , 0     , , 24    3.5   , 3.53  , Cur   , 0.03  

> 23    , 0     , , 24    4.54  , 3.5   , New   , 1.04  

> 26    , 0     , , 25    3.47  , 3.49  , Cur   , 0.02  

> 26    , 1     , , 25    3.46  , 3.51  , Cur   , 0.05  

> 24    , 0     , , 25    4.53  , 3.51  , New   , 1.02  

> 24    , 1     , , 25    4.51  , 3.51  , New   , 1.0   

> 27    , 0     , , 26    3.44  , 3.51  , Cur   , 0.07  

> 27    , 2     , , 26    3.51  , 3.52  , Cur   , 0.01  

> 25    , 0     , , 26    4.56  , 3.46  , New   , 1.1   

> 25    , 2     , , 26    4.55  , 3.47  , New   , 1.08  

> 28    , 0     , , 27    3.47  , 3.5   , Cur   , 0.03  

> 28    , 3     , , 27    3.48  , 3.47  , New   , 0.01  

> 26    , 0     , , 27    4.52  , 3.44  , New   , 1.08  

> 26    , 3     , , 27    4.55  , 3.46  , New   , 1.09  

> 29    , 0     , , 28    3.45  , 3.49  , Cur   , 0.04  

> 29    , 4     , , 28    3.5   , 3.5   , Eq    , 0.0

> 27    , 0     , , 28    4.56  , 3.49  , New   , 1.07  

> 27    , 4     , , 28    4.5   , 3.49  , New   , 1.01  

> 30    , 0     , , 29    3.44  , 3.48  , Cur   , 0.04  

> 30    , 5     , , 29    3.46  , 3.47  , Cur   , 0.01  

> 28    , 0     , , 29    4.49  , 3.43  , New   , 1.06  

> 28    , 5     , , 29    4.57  , 3.45  , New   , 1.12  

> 31    , 0     , , 30    3.48  , 3.48  , Eq    , 0.0

> 31    , 6     , , 30    3.46  , 3.49  , Cur   , 0.03  

> 29    , 0     , , 30    4.49  , 3.44  , New   , 1.05  

> 29    , 6     , , 30    4.53  , 3.44  , New   , 1.09  

> 32    , 0     , , 31    3.44  , 3.45  , Cur   , 0.01  

> 32    , 7     , , 31    3.46  , 3.51  , Cur   , 0.05  

> 30    , 0     , , 31    4.48  , 3.42  , New   , 1.06  

> 30    , 7     , , 31    4.48  , 3.44  , New   , 1.04

> 

> 

> Results For Skylake memchr-avx2

> size  , algn  , Pos   , Cur T , New T , Win   , Dif   

> 2048  , 0     , , 32    6.61  , 5.4   , New   , 1.21  

> 256   , 1     , , 64    6.52  , 5.68  , New   , 0.84  

> 2048  , 0     , , 64    6.03  , 5.47  , New   , 0.56  

> 256   , 2     , , 64    6.07  , 5.42  , New   , 0.65  

> 2048  , 0     , , 128   7.01  , 5.83  , New   , 1.18  

> 256   , 3     , , 64    6.24  , 5.68  , New   , 0.56  

> 2048  , 0     , , 256   11.03 , 9.86  , New   , 1.17  

> 256   , 4     , , 64    6.17  , 5.49  , New   , 0.68  

> 2048  , 0     , , 512   14.11 , 13.41 , New   , 0.7   

> 256   , 5     , , 64    6.03  , 5.45  , New   , 0.58  

> 2048  , 0     , , 1024  19.82 , 19.92 , Cur   , 0.1   

> 256   , 6     , , 64    6.14  , 5.7   , New   , 0.44  

> 2048  , 0     , , 2048  30.9  , 30.59 , New   , 0.31  

> 256   , 7     , , 64    6.05  , 5.64  , New   , 0.41  

> 192   , 1     , , 32    5.6   , 4.89  , New   , 0.71  

> 256   , 1     , , 32    5.59  , 5.07  , New   , 0.52  

> 512   , 1     , , 32    5.58  , 4.93  , New   , 0.65  

> 192   , 2     , , 64    6.14  , 5.46  , New   , 0.68  

> 512   , 2     , , 64    5.95  , 5.38  , New   , 0.57  

> 192   , 3     , , 96    6.6   , 5.74  , New   , 0.86  

> 256   , 3     , , 96    6.48  , 5.37  , New   , 1.11  

> 512   , 3     , , 96    6.56  , 5.44  , New   , 1.12  

> 192   , 4     , , 128   7.04  , 6.02  , New   , 1.02  

> 256   , 4     , , 128   6.96  , 5.89  , New   , 1.07  

> 512   , 4     , , 128   6.97  , 5.99  , New   , 0.98  

> 192   , 5     , , 160   8.49  , 7.07  , New   , 1.42  

> 256   , 5     , , 160   8.1   , 6.96  , New   , 1.14  

> 512   , 5     , , 160   10.48 , 9.14  , New   , 1.34  

> 192   , 6     , , 192   8.46  , 8.52  , Cur   , 0.06  

> 256   , 6     , , 192   8.53  , 7.58  , New   , 0.95  

> 512   , 6     , , 192   10.88 , 9.06  , New   , 1.82  

> 192   , 7     , , 224   8.59  , 8.35  , New   , 0.24  

> 256   , 7     , , 224   8.86  , 7.91  , New   , 0.95  

> 512   , 7     , , 224   10.89 , 8.98  , New   , 1.91  

> 2     , 0     , , 1     4.28  , 3.62  , New   , 0.66  

> 2     , 1     , , 1     4.32  , 3.75  , New   , 0.57  

> 0     , 0     , , 1     3.76  , 3.24  , New   , 0.52  

> 0     , 1     , , 1     3.7   , 3.19  , New   , 0.51  

> 3     , 0     , , 2     4.16  , 3.67  , New   , 0.49  

> 3     , 2     , , 2     4.21  , 3.68  , New   , 0.53  

> 1     , 0     , , 2     4.25  , 3.74  , New   , 0.51  

> 1     , 2     , , 2     4.4   , 3.82  , New   , 0.58  

> 4     , 0     , , 3     4.43  , 3.88  , New   , 0.55  

> 4     , 3     , , 3     4.34  , 3.8   , New   , 0.54  

> 2     , 0     , , 3     4.33  , 3.79  , New   , 0.54  

> 2     , 3     , , 3     4.37  , 3.84  , New   , 0.53  

> 5     , 0     , , 4     4.45  , 3.87  , New   , 0.58  

> 5     , 4     , , 4     4.41  , 3.84  , New   , 0.57  

> 3     , 0     , , 4     4.34  , 3.83  , New   , 0.51  

> 3     , 4     , , 4     4.35  , 3.82  , New   , 0.53  

> 6     , 0     , , 5     4.41  , 3.88  , New   , 0.53  

> 6     , 5     , , 5     4.41  , 3.88  , New   , 0.53  

> 4     , 0     , , 5     4.35  , 3.84  , New   , 0.51  

> 4     , 5     , , 5     4.37  , 3.85  , New   , 0.52  

> 7     , 0     , , 6     4.4   , 3.84  , New   , 0.56  

> 7     , 6     , , 6     4.39  , 3.83  , New   , 0.56  

> 5     , 0     , , 6     4.37  , 3.85  , New   , 0.52  

> 5     , 6     , , 6     4.4   , 3.86  , New   , 0.54  

> 8     , 0     , , 7     4.39  , 3.88  , New   , 0.51  

> 8     , 7     , , 7     4.4   , 3.83  , New   , 0.57  

> 6     , 0     , , 7     4.39  , 3.85  , New   , 0.54  

> 6     , 7     , , 7     4.38  , 3.87  , New   , 0.51  

> 9     , 0     , , 8     4.47  , 3.96  , New   , 0.51  

> 7     , 0     , , 8     4.37  , 3.85  , New   , 0.52  

> 10    , 0     , , 9     4.61  , 4.08  , New   , 0.53  

> 10    , 1     , , 9     4.61  , 4.09  , New   , 0.52  

> 8     , 0     , , 9     4.37  , 3.85  , New   , 0.52  

> 8     , 1     , , 9     4.37  , 3.85  , New   , 0.52  

> 11    , 0     , , 10    4.68  , 4.06  , New   , 0.62  

> 11    , 2     , , 10    4.56  , 4.1   , New   , 0.46  

> 9     , 0     , , 10    4.36  , 3.83  , New   , 0.53  

> 9     , 2     , , 10    4.37  , 3.83  , New   , 0.54  

> 12    , 0     , , 11    4.62  , 4.05  , New   , 0.57  

> 12    , 3     , , 11    4.63  , 4.06  , New   , 0.57  

> 10    , 0     , , 11    4.38  , 3.86  , New   , 0.52  

> 10    , 3     , , 11    4.41  , 3.86  , New   , 0.55  

> 13    , 0     , , 12    4.57  , 4.08  , New   , 0.49  

> 13    , 4     , , 12    4.59  , 4.12  , New   , 0.47  

> 11    , 0     , , 12    4.45  , 4.0   , New   , 0.45  

> 11    , 4     , , 12    4.51  , 4.04  , New   , 0.47  

> 14    , 0     , , 13    4.64  , 4.16  , New   , 0.48  

> 14    , 5     , , 13    4.67  , 4.1   , New   , 0.57  

> 12    , 0     , , 13    4.58  , 4.08  , New   , 0.5   

> 12    , 5     , , 13    4.6   , 4.1   , New   , 0.5   

> 15    , 0     , , 14    4.61  , 4.05  , New   , 0.56  

> 15    , 6     , , 14    4.59  , 4.06  , New   , 0.53  

> 13    , 0     , , 14    4.57  , 4.06  , New   , 0.51  

> 13    , 6     , , 14    4.57  , 4.05  , New   , 0.52  

> 16    , 0     , , 15    4.62  , 4.05  , New   , 0.57  

> 16    , 7     , , 15    4.63  , 4.06  , New   , 0.57  

> 14    , 0     , , 15    4.61  , 4.06  , New   , 0.55  

> 14    , 7     , , 15    4.59  , 4.05  , New   , 0.54  

> 17    , 0     , , 16    4.58  , 4.08  , New   , 0.5   

> 15    , 0     , , 16    4.64  , 4.06  , New   , 0.58  

> 18    , 0     , , 17    4.56  , 4.17  , New   , 0.39  

> 18    , 1     , , 17    4.59  , 4.09  , New   , 0.5   

> 16    , 0     , , 17    4.59  , 4.07  , New   , 0.52  

> 16    , 1     , , 17    4.58  , 4.04  , New   , 0.54  

> 19    , 0     , , 18    4.61  , 4.05  , New   , 0.56  

> 19    , 2     , , 18    4.6   , 4.08  , New   , 0.52  

> 17    , 0     , , 18    4.64  , 4.11  , New   , 0.53  

> 17    , 2     , , 18    4.56  , 4.13  , New   , 0.43  

> 20    , 0     , , 19    4.77  , 4.3   , New   , 0.47  

> 20    , 3     , , 19    4.6   , 4.14  , New   , 0.46  

> 18    , 0     , , 19    4.72  , 4.02  , New   , 0.7   

> 18    , 3     , , 19    4.53  , 4.01  , New   , 0.52  

> 21    , 0     , , 20    4.66  , 4.26  , New   , 0.4   

> 21    , 4     , , 20    4.74  , 4.07  , New   , 0.67  

> 19    , 0     , , 20    4.62  , 4.12  , New   , 0.5   

> 19    , 4     , , 20    4.57  , 4.04  , New   , 0.53  

> 22    , 0     , , 21    4.61  , 4.13  , New   , 0.48  

> 22    , 5     , , 21    4.64  , 4.08  , New   , 0.56  

> 20    , 0     , , 21    4.49  , 4.01  , New   , 0.48  

> 20    , 5     , , 21    4.58  , 4.06  , New   , 0.52  

> 23    , 0     , , 22    4.62  , 4.13  , New   , 0.49  

> 23    , 6     , , 22    4.72  , 4.27  , New   , 0.45  

> 21    , 0     , , 22    4.65  , 3.97  , New   , 0.68  

> 21    , 6     , , 22    4.5   , 4.02  , New   , 0.48  

> 24    , 0     , , 23    4.78  , 4.07  , New   , 0.71  

> 24    , 7     , , 23    4.67  , 4.23  , New   , 0.44  

> 22    , 0     , , 23    4.49  , 3.99  , New   , 0.5   

> 22    , 7     , , 23    4.56  , 4.03  , New   , 0.53  

> 25    , 0     , , 24    4.6   , 4.15  , New   , 0.45  

> 23    , 0     , , 24    4.57  , 4.06  , New   , 0.51  

> 26    , 0     , , 25    4.54  , 4.14  , New   , 0.4   

> 26    , 1     , , 25    4.72  , 4.1   , New   , 0.62  

> 24    , 0     , , 25    4.52  , 4.13  , New   , 0.39  

> 24    , 1     , , 25    4.55  , 4.0   , New   , 0.55  

> 27    , 0     , , 26    4.51  , 4.06  , New   , 0.45  

> 27    , 2     , , 26    4.53  , 4.16  , New   , 0.37  

> 25    , 0     , , 26    4.59  , 4.09  , New   , 0.5   

> 25    , 2     , , 26    4.55  , 4.01  , New   , 0.54  

> 28    , 0     , , 27    4.59  , 3.99  , New   , 0.6   

> 28    , 3     , , 27    4.57  , 3.95  , New   , 0.62  

> 26    , 0     , , 27    4.55  , 4.15  , New   , 0.4   

> 26    , 3     , , 27    4.57  , 3.99  , New   , 0.58  

> 29    , 0     , , 28    4.41  , 4.03  , New   , 0.38  

> 29    , 4     , , 28    4.59  , 4.02  , New   , 0.57  

> 27    , 0     , , 28    4.63  , 4.08  , New   , 0.55  

> 27    , 4     , , 28    4.44  , 4.02  , New   , 0.42  

> 30    , 0     , , 29    4.53  , 3.93  , New   , 0.6   

> 30    , 5     , , 29    4.55  , 3.88  , New   , 0.67  

> 28    , 0     , , 29    4.49  , 3.9   , New   , 0.59  

> 28    , 5     , , 29    4.44  , 3.94  , New   , 0.5   

> 31    , 0     , , 30    4.41  , 3.85  , New   , 0.56  

> 31    , 6     , , 30    4.48  , 3.86  , New   , 0.62  

> 29    , 0     , , 30    4.55  , 3.94  , New   , 0.61  

> 29    , 6     , , 30    4.32  , 3.95  , New   , 0.37  

> 32    , 0     , , 31    4.36  , 3.91  , New   , 0.45  

> 32    , 7     , , 31    4.37  , 3.89  , New   , 0.48  

> 30    , 0     , , 31    4.65  , 3.9   , New   , 0.75  

> 30    , 7     , , 31    4.42  , 3.93  , New   , 0.49  

> 

>  sysdeps/x86_64/multiarch/memchr-evex.S | 580 +++++++++++++++----------

>  1 file changed, 349 insertions(+), 231 deletions(-)

> 

> diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S

> index 6dd5d67b90..65c16ef8a4 100644

> --- a/sysdeps/x86_64/multiarch/memchr-evex.S

> +++ b/sysdeps/x86_64/multiarch/memchr-evex.S

> @@ -26,14 +26,28 @@

>  

>  # ifdef USE_AS_WMEMCHR

>  #  define VPBROADCAST	vpbroadcastd

> -#  define VPCMP		vpcmpd

> -#  define SHIFT_REG	r8d

> +#  define VPMINU	vpminud

> +#  define VPCMP	vpcmpd

> +#  define VPCMPEQ	vpcmpeqd

> +#  define CHAR_SIZE	4

>  # else

>  #  define VPBROADCAST	vpbroadcastb

> -#  define VPCMP		vpcmpb

> -#  define SHIFT_REG	ecx

> +#  define VPMINU	vpminub

> +#  define VPCMP	vpcmpb

> +#  define VPCMPEQ	vpcmpeqb

> +#  define CHAR_SIZE	1

>  # endif

>  

> +# ifdef USE_AS_RAWMEMCHR

> +#  define RAW_PTR_REG	rcx

> +#  define ALGN_PTR_REG	rdi

> +# else

> +#  define RAW_PTR_REG	rdi

> +#  define ALGN_PTR_REG	rcx

> +# endif

> +

> +#define XZERO		xmm23


Add a space before define.  Rename XZERO to XMMZERO.

> +#define YZERO		ymm23


Add a space before define.  Rename YZERO to YMMZERO.

>  # define XMMMATCH	xmm16

>  # define YMMMATCH	ymm16

>  # define YMM1		ymm17

> @@ -44,18 +58,16 @@

>  # define YMM6		ymm22

>  

>  # define VEC_SIZE 32

> +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)

> +# define PAGE_SIZE 4096

>  

>  	.section .text.evex,"ax",@progbits

> -ENTRY (MEMCHR)

> +ENTRY(MEMCHR)


No need for this change.

>  # ifndef USE_AS_RAWMEMCHR

>  	/* Check for zero length.  */

>  	test	%RDX_LP, %RDX_LP

>  	jz	L(zero)

> -# endif

> -	movl	%edi, %ecx

> -# ifdef USE_AS_WMEMCHR

> -	shl	$2, %RDX_LP

> -# else

> +

>  #  ifdef __ILP32__

>  	/* Clear the upper 32 bits.  */

>  	movl	%edx, %edx

> @@ -63,319 +75,425 @@ ENTRY (MEMCHR)

>  # endif

>  	/* Broadcast CHAR to YMMMATCH.  */

>  	VPBROADCAST %esi, %YMMMATCH

> -	/* Check if we may cross page boundary with one vector load.  */

> -	andl	$(2 * VEC_SIZE - 1), %ecx

> -	cmpl	$VEC_SIZE, %ecx

> -	ja	L(cros_page_boundary)

> +	/* Check if we may cross page boundary with one

> +	   vector load.  */


Fit comments to 72 columns.

> +	movl	%edi, %eax

> +	andl	$(PAGE_SIZE - 1), %eax

> +	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax

> +	ja	L(cross_page_boundary)

>  

>  	/* Check the first VEC_SIZE bytes.  */

> -	VPCMP	$0, (%rdi), %YMMMATCH, %k1

> -	kmovd	%k1, %eax

> -	testl	%eax, %eax

> -

> +	VPCMP	$0, (%rdi), %YMMMATCH, %k0

> +	kmovd	%k0, %eax

>  # ifndef USE_AS_RAWMEMCHR

> -	jnz	L(first_vec_x0_check)

> -	/* Adjust length and check the end of data.  */

> -	subq	$VEC_SIZE, %rdx

> -	jbe	L(zero)

> +	/* If length <= CHAR_PER_VEC handle special.  */

> +	cmpq	$CHAR_PER_VEC, %rdx

> +	jbe	L(first_vec_x0)

> +# endif

> +	testl	%eax, %eax

> +	jz	L(aligned_more)

> +	tzcntl	%eax, %eax

> +# ifdef USE_AS_WMEMCHR

> +	/* NB: Multiply wchar_t count by CHAR_SIZE to get the

> +	   number of bytes.  */


Fit comments to 72 columns.

> +	leaq	(%rdi, %rax, CHAR_SIZE), %rax

>  # else

> -	jnz	L(first_vec_x0)

> +	addq	%rdi, %rax

>  # endif

> -

> -	/* Align data for aligned loads in the loop.  */

> -	addq	$VEC_SIZE, %rdi

> -	andl	$(VEC_SIZE - 1), %ecx

> -	andq	$-VEC_SIZE, %rdi

> +	ret

>  

>  # ifndef USE_AS_RAWMEMCHR

> -	/* Adjust length.  */

> -	addq	%rcx, %rdx

> -

> -	subq	$(VEC_SIZE * 4), %rdx

> -	jbe	L(last_4x_vec_or_less)

> -# endif

> -	jmp	L(more_4x_vec)

> +L(zero):

> +	xorl	%eax, %eax

> +	ret

>  

> +	.p2align 5

> +L(first_vec_x0):

> +	/* Check if first match was before length.  */

> +	tzcntl	%eax, %eax

> +	xorl	%ecx, %ecx

> +	cmpl	%eax, %edx

> +	leaq	(%rdi, %rax, CHAR_SIZE), %rax

> +	cmovle	%rcx, %rax

> +	ret

> +# else

> +	/* NB: first_vec_x0 is 17 bytes which will leave

> +	   cross_page_boundary (which is relatively cold) close

> +	   enough to ideal alignment. So only realign

> +	   L(cross_page_boundary) if rawmemchr.  */


Fit comments to 72 columns.

>  	.p2align 4

> -L(cros_page_boundary):

> -	andl	$(VEC_SIZE - 1), %ecx

> +# endif

> +L(cross_page_boundary):

> +	/* Save pointer before aligning as its original

> +	   value is necessary for computing return address if byte is

> +	   found or adjusting length if it is not and this is

> +	   memchr.  */


Fit comments to 72 columns.

> +	movq	%rdi, %rcx

> +	/* Align data to VEC_SIZE. ALGN_PTR_REG is rcx

> +	   for memchr and rdi for rawmemchr.  */


Fit comments to 72 columns.

> +	andq	$-VEC_SIZE, %ALGN_PTR_REG

> +	VPCMP	$0, (%ALGN_PTR_REG), %YMMMATCH, %k0

> +	kmovd	%k0, %r8d

>  # ifdef USE_AS_WMEMCHR

> -	/* NB: Divide shift count by 4 since each bit in K1 represent 4

> -	   bytes.  */

> -	movl	%ecx, %SHIFT_REG

> -	sarl	$2, %SHIFT_REG

> +	/* NB: Divide shift count by 4 since each bit in

> +	   K0 represents 4 bytes.  */

> +	sarl	$2, %eax

> +# endif

> +# ifndef USE_AS_RAWMEMCHR

> +	movl	$(PAGE_SIZE / CHAR_SIZE), %esi

> +	subl	%eax, %esi

>  # endif

> -	andq	$-VEC_SIZE, %rdi

> -	VPCMP	$0, (%rdi), %YMMMATCH, %k1

> -	kmovd	%k1, %eax

> -	/* Remove the leading bytes.  */

> -	sarxl	%SHIFT_REG, %eax, %eax

> -	testl	%eax, %eax

> -	jz	L(aligned_more)

> -	tzcntl	%eax, %eax

>  # ifdef USE_AS_WMEMCHR

> -	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */

> -	sall	$2, %eax

> +	andl	$(CHAR_PER_VEC - 1), %eax

>  # endif

> +	/* Remove the leading bytes.  */

> +	sarxl	%eax, %r8d, %eax

>  # ifndef USE_AS_RAWMEMCHR

>  	/* Check the end of data.  */

> -	cmpq	%rax, %rdx

> -	jbe	L(zero)

> +	cmpq	%rsi, %rdx

> +	jbe	L(first_vec_x0)

> +# endif

> +	testl	%eax, %eax

> +	jz	L(cross_page_continue)

> +	tzcntl	%eax, %eax

> +# ifdef USE_AS_WMEMCHR

> +	/* NB: Multiply char index by CHAR_SIZE to get the

> +	   byte offset.  */

> +	leaq	(%RAW_PTR_REG, %rax, CHAR_SIZE), %rax

> +# else

> +	addq	%RAW_PTR_REG, %rax

>  # endif

> -	addq	%rdi, %rax

> -	addq	%rcx, %rax

>  	ret

>  

>  	.p2align 4

> -L(aligned_more):

> -# ifndef USE_AS_RAWMEMCHR

> -        /* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)"

> -	   instead of "(rdx + rcx) - VEC_SIZE" to void possible addition

> -	   overflow.  */

> -	negq	%rcx

> -	addq	$VEC_SIZE, %rcx

> +L(first_vec_x1):

> +	tzcntl	%eax, %eax

> +	leaq	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax

> +	ret

>  

> -	/* Check the end of data.  */

> -	subq	%rcx, %rdx

> -	jbe	L(zero)

> -# endif

> +	.p2align 4

> +L(first_vec_x2):

> +	tzcntl	%eax, %eax

> +	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax

> +	ret

>  

> -	addq	$VEC_SIZE, %rdi

> +	.p2align 4

> +L(first_vec_x3):

> +	tzcntl	%eax, %eax

> +	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax

> +	ret

> +

> +	.p2align 4

> +L(first_vec_x4):

> +	tzcntl	%eax, %eax

> +	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax

> +	ret

> +

> +	.p2align 5

> +L(aligned_more):

> +	/* Check the first 4 * VEC_SIZE.  Only one

> +	   VEC_SIZE at a time since data is only aligned to

> +	   VEC_SIZE.  */


Fit comments to 72 columns.

>  

>  # ifndef USE_AS_RAWMEMCHR

> -	subq	$(VEC_SIZE * 4), %rdx

> +	/* Align data to VEC_SIZE.  */

> +L(cross_page_continue):

> +	xorl	%ecx, %ecx

> +	subl	%edi, %ecx

> +	andq	$-VEC_SIZE, %rdi

> +	/* esi is for adjusting length to see if near the

> +	   end.  */


Fit comments to 72 columns.

> +	leal	(VEC_SIZE * 5)(%rdi, %rcx), %esi

> +#  ifdef USE_AS_WMEMCHR

> +	/* NB: Divide bytes by 4 to get the wchar_t

> +	   count.  */

> +	sarl	$2, %esi

> +#  endif

> +# else

> +	andq	$-VEC_SIZE, %rdi

> +L(cross_page_continue):

> +# endif

> +	/* Load first VEC regardless.  */

> +	VPCMP	$0, (VEC_SIZE)(%rdi), %YMMMATCH, %k0

> +	kmovd	%k0, %eax

> +# ifndef USE_AS_RAWMEMCHR

> +	/* Adjust length. If near end handle specially.

> +	 */


Fit comments to 72 columns.

> +	subq	%rsi, %rdx

>  	jbe	L(last_4x_vec_or_less)

>  # endif

> -

> -L(more_4x_vec):

> -	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time

> -	   since data is only aligned to VEC_SIZE.  */

> -	VPCMP	$0, (%rdi), %YMMMATCH, %k1

> -	kmovd	%k1, %eax

> -	testl	%eax, %eax

> -	jnz	L(first_vec_x0)

> -

> -	VPCMP	$0, VEC_SIZE(%rdi), %YMMMATCH, %k1

> -	kmovd	%k1, %eax

>  	testl	%eax, %eax

>  	jnz	L(first_vec_x1)

>  

> -	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1

> -	kmovd	%k1, %eax

> +	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0

> +	kmovd	%k0, %eax

>  	testl	%eax, %eax

>  	jnz	L(first_vec_x2)

>  

> -	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1

> -	kmovd	%k1, %eax

> +	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0

> +	kmovd	%k0, %eax

>  	testl	%eax, %eax

>  	jnz	L(first_vec_x3)

>  

> -	addq	$(VEC_SIZE * 4), %rdi

> +	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0

> +	kmovd	%k0, %eax

> +	testl	%eax, %eax

> +	jnz	L(first_vec_x4)

> +

>  

>  # ifndef USE_AS_RAWMEMCHR

> -	subq	$(VEC_SIZE * 4), %rdx

> -	jbe	L(last_4x_vec_or_less)

> -# endif

> +	/* Check if at last CHAR_PER_VEC * 4 length.  */

> +	subq	$(CHAR_PER_VEC * 4), %rdx

> +	jbe	L(last_4x_vec_or_less_cmpeq)

> +	addq	$VEC_SIZE, %rdi

>  

> -	/* Align data to 4 * VEC_SIZE.  */

> -	movq	%rdi, %rcx

> -	andl	$(4 * VEC_SIZE - 1), %ecx

> +	/* Align data to VEC_SIZE * 4 for the loop and

> +	   readjust length.  */


Fit comments to 72 columns.

> +#  ifdef USE_AS_WMEMCHR

> +	movl	%edi, %ecx

>  	andq	$-(4 * VEC_SIZE), %rdi

> -

> -# ifndef USE_AS_RAWMEMCHR

> -	/* Adjust length.  */

> +	andl	$(VEC_SIZE * 4 - 1), %ecx

> +	/* NB: Divide bytes by 4 to get the wchar_t

> +	   count.  */


Fit comments to 72 columns.

> +	sarl	$2, %ecx

>  	addq	%rcx, %rdx

> +#  else

> +	addq	%rdi, %rdx

> +	andq	$-(4 * VEC_SIZE), %rdi

> +	subq	%rdi, %rdx

> +#  endif

> +# else

> +	addq	$VEC_SIZE, %rdi

> +	andq	$-(4 * VEC_SIZE), %rdi

>  # endif

>  

> +	vpxorq	%XZERO, %XZERO, %XZERO

> +

> +	/* Compare 4 * VEC at a time forward.  */

>  	.p2align 4

>  L(loop_4x_vec):

> -	/* Compare 4 * VEC at a time forward.  */

> -	VPCMP	$0, (%rdi), %YMMMATCH, %k1

> -	VPCMP	$0, VEC_SIZE(%rdi), %YMMMATCH, %k2

> -	kord	%k1, %k2, %k5

> -	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k3

> -	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k4

> -

> -	kord	%k3, %k4, %k6

> -	kortestd %k5, %k6

> -	jnz	L(4x_vec_end)

> -

> -	addq	$(VEC_SIZE * 4), %rdi

> -

> +	/* It would be possible to save some instructions

> +	   using 4x VPCMP but bottleneck on port 5 makes it not worth

> +	   it.  */


Fit comments to 72 columns.

> +	VPCMP	$4, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k1

> +	/* xor will set bytes matching esi to zero.  */

> +	vpxorq	(VEC_SIZE * 5)(%rdi), %YMMMATCH, %YMM2

> +	vpxorq	(VEC_SIZE * 6)(%rdi), %YMMMATCH, %YMM3

> +	VPCMP	$0, (VEC_SIZE * 7)(%rdi), %YMMMATCH, %k3

> +	/* Reduce VEC2 / VEC3 with min and VEC1 with zero

> +	   mask.  */


Fit comments to 72 columns.

> +	VPMINU	%YMM2, %YMM3, %YMM3 {%k1} {z}

> +	VPCMP	$0, %YMM3, %YZERO, %k2

>  # ifdef USE_AS_RAWMEMCHR

> -	jmp	L(loop_4x_vec)

> +	subq	$-(VEC_SIZE * 4), %rdi

> +	kortestd %k2, %k3

> +	jz	L(loop_4x_vec)

>  # else

> -	subq	$(VEC_SIZE * 4), %rdx

> -	ja	L(loop_4x_vec)

> +	kortestd %k2, %k3

> +	jnz	L(loop_4x_vec_end)

>  

> -L(last_4x_vec_or_less):

> -	/* Less than 4 * VEC and aligned to VEC_SIZE.  */

> -	addl	$(VEC_SIZE * 2), %edx

> -	jle	L(last_2x_vec)

> +	subq	$-(VEC_SIZE * 4), %rdi

>  

> -	VPCMP	$0, (%rdi), %YMMMATCH, %k1

> -	kmovd	%k1, %eax

> -	testl	%eax, %eax

> -	jnz	L(first_vec_x0)

> +	subq	$(CHAR_PER_VEC * 4), %rdx

> +	ja	L(loop_4x_vec)

>  

> -	VPCMP	$0, VEC_SIZE(%rdi), %YMMMATCH, %k1

> -	kmovd	%k1, %eax

> +	/* Fall through into less than 4 remaining

> +	   vectors of length case.  */


Fit comments to 72 columns.

> +	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0

> +	kmovd	%k0, %eax

> +	addq	$(VEC_SIZE * 3), %rdi

> +	.p2align 4

> +L(last_4x_vec_or_less):

> +	/* Check if first VEC contained match.  */

>  	testl	%eax, %eax

> -	jnz	L(first_vec_x1)

> +	jnz	L(first_vec_x1_check)

>  

> -	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1

> -	kmovd	%k1, %eax

> -	testl	%eax, %eax

> +	/* If remaining length > CHAR_PER_VEC * 2.  */

> +	addl	$(CHAR_PER_VEC * 2), %edx

> +	jg	L(last_4x_vec)

>  

> -	jnz	L(first_vec_x2_check)

> -	subl	$VEC_SIZE, %edx

> -	jle	L(zero)

> +L(last_2x_vec):

> +	/* If remaining length < CHAR_PER_VEC.  */

> +	addl	$CHAR_PER_VEC, %edx

> +	jle	L(zero_end)

> +

> +	/* Check VEC2 and compare any match with

> +	   remaining length.  */


Fit comments to 72 columns.

> +	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0

> +	kmovd	%k0, %eax

> +	tzcntl	%eax, %eax

> +	cmpl	%eax, %edx

> +	jbe	L(set_zero_end)

> +	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax

> +L(zero_end):

> +	ret

>  

> -	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1

> -	kmovd	%k1, %eax

> -	testl	%eax, %eax

>  

> -	jnz	L(first_vec_x3_check)

> +	.p2align 4

> +L(first_vec_x1_check):

> +	tzcntl	%eax, %eax

> +	/* Adjust length.  */

> +	subl	$-(CHAR_PER_VEC * 4), %edx

> +	/* Check if match within remaining length.  */

> +	cmpl	%eax, %edx

> +	jbe	L(set_zero_end)

> +	/* NB: Multiply char index by CHAR_SIZE to get the

> +	   byte offset.  */


Fit comments to 72 columns.

> +	leaq	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax

> +	ret

> +L(set_zero_end):

>  	xorl	%eax, %eax

>  	ret

>  

>  	.p2align 4

> -L(last_2x_vec):

> -	addl	$(VEC_SIZE * 2), %edx

> -	VPCMP	$0, (%rdi), %YMMMATCH, %k1

> +L(loop_4x_vec_end):

> +# endif

> +	/* rawmemchr will fall through into this if match

> +	   was found in loop.  */


Fit comments to 72 columns.

> +

> +	/* k1 holds the NOT of the matches with VEC1.  */

>  	kmovd	%k1, %eax

> -	testl	%eax, %eax

> +# ifdef USE_AS_WMEMCHR

> +	subl	$((1 << CHAR_PER_VEC) - 1), %eax

> +# else

> +	incl	%eax

> +# endif

> +	jnz	L(last_vec_x1_return)

>  

> -	jnz	L(first_vec_x0_check)

> -	subl	$VEC_SIZE, %edx

> -	jle	L(zero)

> +	VPCMP	$0, %YMM2, %YZERO, %k0

> +	kmovd	%k0, %eax

> +	testl	%eax, %eax

> +	jnz	L(last_vec_x2_return)

>  

> -	VPCMP	$0, VEC_SIZE(%rdi), %YMMMATCH, %k1

> -	kmovd	%k1, %eax

> +	kmovd	%k2, %eax

>  	testl	%eax, %eax

> -	jnz	L(first_vec_x1_check)

> -	xorl	%eax, %eax

> -	ret

> +	jnz	L(last_vec_x3_return)

>  

> -	.p2align 4

> -L(first_vec_x0_check):

> +	kmovd	%k3, %eax

>  	tzcntl	%eax, %eax

> -# ifdef USE_AS_WMEMCHR

> -	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */

> -	sall	$2, %eax

> +# ifdef USE_AS_RAWMEMCHR

> +	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax

> +# else

> +	leaq	(VEC_SIZE * 7)(%rdi, %rax, CHAR_SIZE), %rax

>  # endif

> -	/* Check the end of data.  */

> -	cmpq	%rax, %rdx

> -	jbe	L(zero)

> -	addq	%rdi, %rax

>  	ret

>  

>  	.p2align 4

> -L(first_vec_x1_check):

> +L(last_vec_x1_return):

>  	tzcntl	%eax, %eax

> -# ifdef USE_AS_WMEMCHR

> -	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */

> -	sall	$2, %eax

> -# endif

> -	/* Check the end of data.  */

> -	cmpq	%rax, %rdx

> -	jbe	L(zero)

> -	addq	$VEC_SIZE, %rax

> +# ifdef USE_AS_RAWMEMCHR

> +#  ifdef USE_AS_WMEMCHR

> +	/* NB: Multiply char index by CHAR_SIZE to get the

> +	   byte offset.  */


Fit comments to 72 columns.

> +	leaq	(%rdi, %rax, CHAR_SIZE), %rax

> +#  else

>  	addq	%rdi, %rax

> -	ret

> -

> -	.p2align 4

> -L(first_vec_x2_check):

> -	tzcntl	%eax, %eax

> -# ifdef USE_AS_WMEMCHR

> -	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */

> -	sall	$2, %eax

> +#  endif

> +# else

> +	/* NB: Multiply char index by CHAR_SIZE to get the

> +	   byte offset.  */


Fit comments to 72 columns.

> +	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax

>  # endif

> -	/* Check the end of data.  */

> -	cmpq	%rax, %rdx

> -	jbe	L(zero)

> -	addq	$(VEC_SIZE * 2), %rax

> -	addq	%rdi, %rax

>  	ret

>  

>  	.p2align 4

> -L(first_vec_x3_check):

> +L(last_vec_x2_return):

>  	tzcntl	%eax, %eax

> -# ifdef USE_AS_WMEMCHR

> -	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */

> -	sall	$2, %eax

> +# ifdef USE_AS_RAWMEMCHR

> +	/* NB: Multiply char index by CHAR_SIZE to get the

> +	   byte offset.  */

> +	leaq	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax

> +# else

> +	/* NB: Multiply char index by CHAR_SIZE to get the

> +	   byte offset.  */

> +	leaq	(VEC_SIZE * 5)(%rdi, %rax, CHAR_SIZE), %rax

>  # endif

> -	/* Check the end of data.  */

> -	cmpq	%rax, %rdx

> -	jbe	L(zero)

> -	addq	$(VEC_SIZE * 3), %rax

> -	addq	%rdi, %rax

>  	ret

>  

>  	.p2align 4

> -L(zero):

> -	xorl	%eax, %eax

> -	ret

> -# endif

> -

> -	.p2align 4

> -L(first_vec_x0):

> +L(last_vec_x3_return):

>  	tzcntl	%eax, %eax

> -# ifdef USE_AS_WMEMCHR

> -	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */

> -	leaq	(%rdi, %rax, 4), %rax

> +# ifdef USE_AS_RAWMEMCHR

> +	/* NB: Multiply char index by CHAR_SIZE to get the

> +	   byte offset.  */


Fit comments to 72 columns.

> +	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax

>  # else

> -	addq	%rdi, %rax

> +	/* NB: Multiply char index by CHAR_SIZE to get the

> +	   byte offset.  */


Fit comments to 72 columns.

> +	leaq	(VEC_SIZE * 6)(%rdi, %rax, CHAR_SIZE), %rax

>  # endif

>  	ret

>  

> +

> +# ifndef USE_AS_RAWMEMCHR

> +L(last_4x_vec_or_less_cmpeq):

> +	VPCMP	$0, (VEC_SIZE * 5)(%rdi), %YMMMATCH, %k0

> +	kmovd	%k0, %eax

> +	subq	$-(VEC_SIZE * 4), %rdi

> +	/* Check first VEC regardless.  */

> +	testl	%eax, %eax

> +	jnz	L(first_vec_x1_check)

> +

> +	/* If remaining length <= CHAR_PER_VEC * 2.  */

> +	addl	$(CHAR_PER_VEC * 2), %edx

> +	jle	L(last_2x_vec)

> +

>  	.p2align 4

> -L(first_vec_x1):

> +L(last_4x_vec):

> +	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0

> +	kmovd	%k0, %eax

> +	testl	%eax, %eax

> +	jnz	L(last_vec_x2)

> +

> +

> +	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0

> +	kmovd	%k0, %eax

> +	/* Create mask for possible matches within

> +	   remaining length.  */


Fit comments to 72 columns.

> +#  ifdef USE_AS_WMEMCHR

> +	movl	$((1 << (CHAR_PER_VEC * 2)) - 1), %ecx

> +	bzhil	%edx, %ecx, %ecx

> +#  else

> +	movq	$-1, %rcx

> +	bzhiq	%rdx, %rcx, %rcx

> +#  endif

> +	/* Test matches in data against length match.  */

> +	andl	%ecx, %eax

> +	jnz	L(last_vec_x3)

> +

> +	/* if remaining length <= CHAR_PER_VEC * 3 (Note

> +	   this is after remaining length was found to be >

> +	   CHAR_PER_VEC * 2).  */


Fit comments to 72 columns.

> +	subl	$CHAR_PER_VEC, %edx

> +	jbe	L(zero_end2)

> +

> +

> +	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0

> +	kmovd	%k0, %eax

> +	/* Shift remaining length mask for last VEC.  */

> +#  ifdef USE_AS_WMEMCHR

> +	shrl	$CHAR_PER_VEC, %ecx

> +#  else

> +	shrq	$CHAR_PER_VEC, %rcx

> +#  endif

> +	andl	%ecx, %eax

> +	jz	L(zero_end2)

>  	tzcntl	%eax, %eax

> -# ifdef USE_AS_WMEMCHR

> -	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */

> -	leaq	VEC_SIZE(%rdi, %rax, 4), %rax

> -# else

> -	addq	$VEC_SIZE, %rax

> -	addq	%rdi, %rax

> -# endif

> +	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax

> +L(zero_end2):

>  	ret

>  

> -	.p2align 4

> -L(first_vec_x2):

> +L(last_vec_x2):

>  	tzcntl	%eax, %eax

> -# ifdef USE_AS_WMEMCHR

> -	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */

> -	leaq	(VEC_SIZE * 2)(%rdi, %rax, 4), %rax

> -# else

> -	addq	$(VEC_SIZE * 2), %rax

> -	addq	%rdi, %rax

> -# endif

> +	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax

>  	ret

>  

>  	.p2align 4

> -L(4x_vec_end):

> -	kmovd	%k1, %eax

> -	testl	%eax, %eax

> -	jnz	L(first_vec_x0)

> -	kmovd	%k2, %eax

> -	testl	%eax, %eax

> -	jnz	L(first_vec_x1)

> -	kmovd	%k3, %eax

> -	testl	%eax, %eax

> -	jnz	L(first_vec_x2)

> -	kmovd	%k4, %eax

> -	testl	%eax, %eax

> -L(first_vec_x3):

> +L(last_vec_x3):

>  	tzcntl	%eax, %eax

> -# ifdef USE_AS_WMEMCHR

> -	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */

> -	leaq	(VEC_SIZE * 3)(%rdi, %rax, 4), %rax

> -# else

> -	addq	$(VEC_SIZE * 3), %rax

> -	addq	%rdi, %rax

> -# endif

> +	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax

>  	ret

> +# endif

>  

> -END (MEMCHR)

> +END(MEMCHR)


No need for this change.

>  #endif

> -- 

> 2.29.2

> 


Thanks.

H.J.
Adhemerval Zanella via Libc-alpha May 3, 2021, 8:06 p.m. | #2
On Mon, May 3, 2021 at 2:58 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>

> On Mon, May 03, 2021 at 04:44:38AM -0400, Noah Goldstein wrote:

> > No bug. This commit optimizes memchr-evex.S. The optimizations include

> > replacing some branches with cmovcc, avoiding some branches entirely

> > in the less_4x_vec case, making the page cross logic less strict,

> > saving some ALU in the alignment process, and most importantly

> > increasing ILP in the 4x loop. test-memchr, test-rawmemchr, and

> > test-wmemchr are all passing.

> >

> > Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>

> > ---

> > Tests were run on the following CPUs:

> >

> > Tigerlake: https://ark.intel.com/content/www/us/en/ark/products/208921/intel-core-i7-1165g7-processor-12m-cache-up-to-4-70-ghz-with-ipu.html

> >

> > Icelake: https://ark.intel.com/content/www/us/en/ark/products/196597/intel-core-i7-1065g7-processor-8m-cache-up-to-3-90-ghz.html

> >

> > Skylake: https://ark.intel.com/content/www/us/en/ark/products/149091/intel-core-i7-8565u-processor-8m-cache-up-to-4-60-ghz.html

> >

> > All times are the geometric mean of N=20. The unit of time is

> > seconds.

> >

> > "Cur" refers to the current implementation

> > "New" refers to this patch's implementation

> >

> > Note: The numbers for size = [1, 32] are highly dependent on function

> > alignment. That being said the new implementation which uses cmovcc

> > instead of a branch (mostly for the reason of high variance with

> > different alignments) for the [1, 32] case is far more consistent and

> > performs about as well (and should only be a bigger improvement in

> > cases where the sizes / position are not 100% predictable).

> >

> > For memchr-evex the numbers are a near universal improvement. The case

> > where the current implementation is better is for size = 0, and for size =

> > [1, 32] with pos < size the two implementations are about the

> > same. For size = [1, 32] with pos > size, for medium range sizes, and

> > large size, however, the new implementation is faster.

> >

> > Results For Tigerlake memchr-evex

> > size  , algn  , Pos   , Cur T , New T , Win   , Dif

> > 2048  , 0     , , 32    5.58  , 5.22  , New   , 0.36

> > 256   , 1     , , 64    5.22  , 4.93  , New   , 0.29

> > 2048  , 0     , , 64    5.22  , 4.89  , New   , 0.33

> > 256   , 2     , , 64    5.14  , 4.81  , New   , 0.33

> > 2048  , 0     , , 128   6.3   , 5.67  , New   , 0.63

> > 256   , 3     , , 64    5.22  , 4.9   , New   , 0.32

> > 2048  , 0     , , 256   11.07 , 10.92 , New   , 0.15

> > 256   , 4     , , 64    5.16  , 4.86  , New   , 0.3

> > 2048  , 0     , , 512   15.66 , 14.81 , New   , 0.85

> > 256   , 5     , , 64    5.15  , 4.84  , New   , 0.31

> > 2048  , 0     , , 1024  25.7  , 23.02 , New   , 2.68

> > 256   , 6     , , 64    5.12  , 4.89  , New   , 0.23

> > 2048  , 0     , , 2048  42.34 , 37.71 , New   , 4.63

> > 256   , 7     , , 64    5.03  , 4.62  , New   , 0.41

> > 192   , 1     , , 32    4.96  , 4.28  , New   , 0.68

> > 256   , 1     , , 32    4.95  , 4.28  , New   , 0.67

> > 512   , 1     , , 32    4.94  , 4.29  , New   , 0.65

> > 192   , 2     , , 64    5.1   , 4.8   , New   , 0.3

> > 512   , 2     , , 64    5.12  , 4.72  , New   , 0.4

> > 192   , 3     , , 96    5.54  , 5.12  , New   , 0.42

> > 256   , 3     , , 96    5.52  , 5.15  , New   , 0.37

> > 512   , 3     , , 96    5.51  , 5.16  , New   , 0.35

> > 192   , 4     , , 128   6.1   , 5.53  , New   , 0.57

> > 256   , 4     , , 128   6.09  , 5.49  , New   , 0.6

> > 512   , 4     , , 128   6.08  , 5.48  , New   , 0.6

> > 192   , 5     , , 160   7.42  , 6.71  , New   , 0.71

> > 256   , 5     , , 160   6.86  , 6.71  , New   , 0.15

> > 512   , 5     , , 160   9.28  , 8.68  , New   , 0.6

> > 192   , 6     , , 192   7.94  , 7.47  , New   , 0.47

> > 256   , 6     , , 192   7.62  , 7.17  , New   , 0.45

> > 512   , 6     , , 192   9.2   , 9.16  , New   , 0.04

> > 192   , 7     , , 224   8.02  , 7.43  , New   , 0.59

> > 256   , 7     , , 224   8.34  , 7.85  , New   , 0.49

> > 512   , 7     , , 224   9.89  , 9.16  , New   , 0.73

> > 2     , 0     , , 1     3.0   , 3.0   , Eq    , 0.0

> > 2     , 1     , , 1     3.0   , 3.0   , Eq    , 0.0

> > 0     , 0     , , 1     3.01  , 3.6   , Cur   , 0.59

> > 0     , 1     , , 1     3.01  , 3.6   , Cur   , 0.59

> > 3     , 0     , , 2     3.0   , 3.0   , Eq    , 0.0

> > 3     , 2     , , 2     3.0   , 3.0   , Eq    , 0.0

> > 1     , 0     , , 2     3.6   , 3.0   , New   , 0.6

> > 1     , 2     , , 2     3.6   , 3.0   , New   , 0.6

> > 4     , 0     , , 3     3.01  , 3.01  , Eq    , 0.0

> > 4     , 3     , , 3     3.01  , 3.01  , Eq    , 0.0

> > 2     , 0     , , 3     3.62  , 3.02  , New   , 0.6

> > 2     , 3     , , 3     3.62  , 3.03  , New   , 0.59

> > 5     , 0     , , 4     3.02  , 3.03  , Cur   , 0.01

> > 5     , 4     , , 4     3.02  , 3.02  , Eq    , 0.0

> > 3     , 0     , , 4     3.63  , 3.02  , New   , 0.61

> > 3     , 4     , , 4     3.63  , 3.04  , New   , 0.59

> > 6     , 0     , , 5     3.05  , 3.04  , New   , 0.01

> > 6     , 5     , , 5     3.02  , 3.02  , Eq    , 0.0

> > 4     , 0     , , 5     3.63  , 3.02  , New   , 0.61

> > 4     , 5     , , 5     3.64  , 3.03  , New   , 0.61

> > 7     , 0     , , 6     3.03  , 3.03  , Eq    , 0.0

> > 7     , 6     , , 6     3.02  , 3.02  , Eq    , 0.0

> > 5     , 0     , , 6     3.64  , 3.01  , New   , 0.63

> > 5     , 6     , , 6     3.64  , 3.03  , New   , 0.61

> > 8     , 0     , , 7     3.03  , 3.04  , Cur   , 0.01

> > 8     , 7     , , 7     3.04  , 3.04  , Eq    , 0.0

> > 6     , 0     , , 7     3.67  , 3.04  , New   , 0.63

> > 6     , 7     , , 7     3.65  , 3.05  , New   , 0.6

> > 9     , 0     , , 8     3.05  , 3.05  , Eq    , 0.0

> > 7     , 0     , , 8     3.67  , 3.05  , New   , 0.62

> > 10    , 0     , , 9     3.06  , 3.06  , Eq    , 0.0

> > 10    , 1     , , 9     3.06  , 3.06  , Eq    , 0.0

> > 8     , 0     , , 9     3.67  , 3.06  , New   , 0.61

> > 8     , 1     , , 9     3.67  , 3.06  , New   , 0.61

> > 11    , 0     , , 10    3.06  , 3.06  , Eq    , 0.0

> > 11    , 2     , , 10    3.07  , 3.06  , New   , 0.01

> > 9     , 0     , , 10    3.67  , 3.05  , New   , 0.62

> > 9     , 2     , , 10    3.67  , 3.06  , New   , 0.61

> > 12    , 0     , , 11    3.06  , 3.06  , Eq    , 0.0

> > 12    , 3     , , 11    3.06  , 3.06  , Eq    , 0.0

> > 10    , 0     , , 11    3.67  , 3.06  , New   , 0.61

> > 10    , 3     , , 11    3.67  , 3.06  , New   , 0.61

> > 13    , 0     , , 12    3.06  , 3.07  , Cur   , 0.01

> > 13    , 4     , , 12    3.06  , 3.07  , Cur   , 0.01

> > 11    , 0     , , 12    3.67  , 3.11  , New   , 0.56

> > 11    , 4     , , 12    3.68  , 3.12  , New   , 0.56

> > 14    , 0     , , 13    3.07  , 3.1   , Cur   , 0.03

> > 14    , 5     , , 13    3.06  , 3.07  , Cur   , 0.01

> > 12    , 0     , , 13    3.67  , 3.07  , New   , 0.6

> > 12    , 5     , , 13    3.67  , 3.08  , New   , 0.59

> > 15    , 0     , , 14    3.06  , 3.06  , Eq    , 0.0

> > 15    , 6     , , 14    3.07  , 3.06  , New   , 0.01

> > 13    , 0     , , 14    3.67  , 3.06  , New   , 0.61

> > 13    , 6     , , 14    3.68  , 3.06  , New   , 0.62

> > 16    , 0     , , 15    3.06  , 3.06  , Eq    , 0.0

> > 16    , 7     , , 15    3.06  , 3.05  , New   , 0.01

> > 14    , 0     , , 15    3.68  , 3.06  , New   , 0.62

> > 14    , 7     , , 15    3.67  , 3.06  , New   , 0.61

> > 17    , 0     , , 16    3.07  , 3.06  , New   , 0.01

> > 15    , 0     , , 16    3.68  , 3.06  , New   , 0.62

> > 18    , 0     , , 17    3.06  , 3.06  , Eq    , 0.0

> > 18    , 1     , , 17    3.06  , 3.06  , Eq    , 0.0

> > 16    , 0     , , 17    3.67  , 3.06  , New   , 0.61

> > 16    , 1     , , 17    3.67  , 3.05  , New   , 0.62

> > 19    , 0     , , 18    3.07  , 3.06  , New   , 0.01

> > 19    , 2     , , 18    3.06  , 3.06  , Eq    , 0.0

> > 17    , 0     , , 18    3.68  , 3.08  , New   , 0.6

> > 17    , 2     , , 18    3.68  , 3.06  , New   , 0.62

> > 20    , 0     , , 19    3.06  , 3.06  , Eq    , 0.0

> > 20    , 3     , , 19    3.06  , 3.06  , Eq    , 0.0

> > 18    , 0     , , 19    3.68  , 3.06  , New   , 0.62

> > 18    , 3     , , 19    3.68  , 3.06  , New   , 0.62

> > 21    , 0     , , 20    3.06  , 3.06  , Eq    , 0.0

> > 21    , 4     , , 20    3.06  , 3.06  , Eq    , 0.0

> > 19    , 0     , , 20    3.67  , 3.06  , New   , 0.61

> > 19    , 4     , , 20    3.67  , 3.06  , New   , 0.61

> > 22    , 0     , , 21    3.06  , 3.06  , Eq    , 0.0

> > 22    , 5     , , 21    3.06  , 3.06  , Eq    , 0.0

> > 20    , 0     , , 21    3.67  , 3.05  , New   , 0.62

> > 20    , 5     , , 21    3.68  , 3.06  , New   , 0.62

> > 23    , 0     , , 22    3.07  , 3.06  , New   , 0.01

> > 23    , 6     , , 22    3.06  , 3.06  , Eq    , 0.0

> > 21    , 0     , , 22    3.68  , 3.07  , New   , 0.61

> > 21    , 6     , , 22    3.67  , 3.06  , New   , 0.61

> > 24    , 0     , , 23    3.19  , 3.06  , New   , 0.13

> > 24    , 7     , , 23    3.08  , 3.06  , New   , 0.02

> > 22    , 0     , , 23    3.69  , 3.06  , New   , 0.63

> > 22    , 7     , , 23    3.68  , 3.06  , New   , 0.62

> > 25    , 0     , , 24    3.07  , 3.06  , New   , 0.01

> > 23    , 0     , , 24    3.68  , 3.06  , New   , 0.62

> > 26    , 0     , , 25    3.06  , 3.05  , New   , 0.01

> > 26    , 1     , , 25    3.07  , 3.06  , New   , 0.01

> > 24    , 0     , , 25    3.67  , 3.05  , New   , 0.62

> > 24    , 1     , , 25    3.68  , 3.06  , New   , 0.62

> > 27    , 0     , , 26    3.12  , 3.06  , New   , 0.06

> > 27    , 2     , , 26    3.08  , 3.06  , New   , 0.02

> > 25    , 0     , , 26    3.69  , 3.06  , New   , 0.63

> > 25    , 2     , , 26    3.67  , 3.06  , New   , 0.61

> > 28    , 0     , , 27    3.06  , 3.06  , Eq    , 0.0

> > 28    , 3     , , 27    3.06  , 3.06  , Eq    , 0.0

> > 26    , 0     , , 27    3.67  , 3.06  , New   , 0.61

> > 26    , 3     , , 27    3.67  , 3.06  , New   , 0.61

> > 29    , 0     , , 28    3.06  , 3.06  , Eq    , 0.0

> > 29    , 4     , , 28    3.06  , 3.06  , Eq    , 0.0

> > 27    , 0     , , 28    3.68  , 3.05  , New   , 0.63

> > 27    , 4     , , 28    3.67  , 3.06  , New   , 0.61

> > 30    , 0     , , 29    3.06  , 3.06  , Eq    , 0.0

> > 30    , 5     , , 29    3.06  , 3.06  , Eq    , 0.0

> > 28    , 0     , , 29    3.67  , 3.06  , New   , 0.61

> > 28    , 5     , , 29    3.68  , 3.06  , New   , 0.62

> > 31    , 0     , , 30    3.06  , 3.06  , Eq    , 0.0

> > 31    , 6     , , 30    3.06  , 3.06  , Eq    , 0.0

> > 29    , 0     , , 30    3.68  , 3.06  , New   , 0.62

> > 29    , 6     , , 30    3.7   , 3.06  , New   , 0.64

> > 32    , 0     , , 31    3.17  , 3.06  , New   , 0.11

> > 32    , 7     , , 31    3.12  , 3.06  , New   , 0.06

> > 30    , 0     , , 31    3.68  , 3.06  , New   , 0.62

> > 30    , 7     , , 31    3.68  , 3.06  , New   , 0.62

> >

> > Results For Icelake memchr-evex

> > size  , algn  , Pos   , Cur T , New T , Win   , Dif

> > 2048  , 0     , , 32    4.94  , 4.26  , New   , 0.68

> > 256   , 1     , , 64    4.5   , 4.13  , New   , 0.37

> > 2048  , 0     , , 64    4.19  , 3.9   , New   , 0.29

> > 256   , 2     , , 64    4.19  , 3.87  , New   , 0.32

> > 2048  , 0     , , 128   4.96  , 4.53  , New   , 0.43

> > 256   , 3     , , 64    4.07  , 3.86  , New   , 0.21

> > 2048  , 0     , , 256   8.77  , 8.61  , New   , 0.16

> > 256   , 4     , , 64    4.08  , 3.87  , New   , 0.21

> > 2048  , 0     , , 512   12.22 , 11.67 , New   , 0.55

> > 256   , 5     , , 64    4.12  , 3.83  , New   , 0.29

> > 2048  , 0     , , 1024  20.06 , 18.09 , New   , 1.97

> > 256   , 6     , , 64    4.2   , 3.95  , New   , 0.25

> > 2048  , 0     , , 2048  33.83 , 30.62 , New   , 3.21

> > 256   , 7     , , 64    4.3   , 4.04  , New   , 0.26

> > 192   , 1     , , 32    4.2   , 3.71  , New   , 0.49

> > 256   , 1     , , 32    4.24  , 3.76  , New   , 0.48

> > 512   , 1     , , 32    4.29  , 3.74  , New   , 0.55

> > 192   , 2     , , 64    4.42  , 4.0   , New   , 0.42

> > 512   , 2     , , 64    4.17  , 3.83  , New   , 0.34

> > 192   , 3     , , 96    4.44  , 4.26  , New   , 0.18

> > 256   , 3     , , 96    4.45  , 4.14  , New   , 0.31

> > 512   , 3     , , 96    4.42  , 4.15  , New   , 0.27

> > 192   , 4     , , 128   4.93  , 4.45  , New   , 0.48

> > 256   , 4     , , 128   4.93  , 4.47  , New   , 0.46

> > 512   , 4     , , 128   4.95  , 4.47  , New   , 0.48

> > 192   , 5     , , 160   5.95  , 5.44  , New   , 0.51

> > 256   , 5     , , 160   5.59  , 5.47  , New   , 0.12

> > 512   , 5     , , 160   7.59  , 7.34  , New   , 0.25

> > 192   , 6     , , 192   6.53  , 6.08  , New   , 0.45

> > 256   , 6     , , 192   6.2   , 5.88  , New   , 0.32

> > 512   , 6     , , 192   7.53  , 7.62  , Cur   , 0.09

> > 192   , 7     , , 224   6.62  , 6.12  , New   , 0.5

> > 256   , 7     , , 224   6.79  , 6.51  , New   , 0.28

> > 512   , 7     , , 224   8.12  , 7.61  , New   , 0.51

> > 2     , 0     , , 1     2.5   , 2.54  , Cur   , 0.04

> > 2     , 1     , , 1     2.56  , 2.55  , New   , 0.01

> > 0     , 0     , , 1     2.57  , 3.12  , Cur   , 0.55

> > 0     , 1     , , 1     2.59  , 3.14  , Cur   , 0.55

> > 3     , 0     , , 2     2.62  , 2.63  , Cur   , 0.01

> > 3     , 2     , , 2     2.66  , 2.67  , Cur   , 0.01

> > 1     , 0     , , 2     3.24  , 2.72  , New   , 0.52

> > 1     , 2     , , 2     3.28  , 2.75  , New   , 0.53

> > 4     , 0     , , 3     2.78  , 2.8   , Cur   , 0.02

> > 4     , 3     , , 3     2.8   , 2.82  , Cur   , 0.02

> > 2     , 0     , , 3     3.38  , 2.86  , New   , 0.52

> > 2     , 3     , , 3     3.41  , 2.89  , New   , 0.52

> > 5     , 0     , , 4     2.88  , 2.91  , Cur   , 0.03

> > 5     , 4     , , 4     2.88  , 2.92  , Cur   , 0.04

> > 3     , 0     , , 4     3.48  , 2.93  , New   , 0.55

> > 3     , 4     , , 4     3.47  , 2.93  , New   , 0.54

> > 6     , 0     , , 5     2.95  , 2.94  , New   , 0.01

> > 6     , 5     , , 5     2.91  , 2.92  , Cur   , 0.01

> > 4     , 0     , , 5     3.47  , 2.9   , New   , 0.57

> > 4     , 5     , , 5     3.43  , 2.91  , New   , 0.52

> > 7     , 0     , , 6     2.87  , 2.9   , Cur   , 0.03

> > 7     , 6     , , 6     2.87  , 2.89  , Cur   , 0.02

> > 5     , 0     , , 6     3.44  , 2.88  , New   , 0.56

> > 5     , 6     , , 6     3.41  , 2.87  , New   , 0.54

> > 8     , 0     , , 7     2.86  , 2.87  , Cur   , 0.01

> > 8     , 7     , , 7     2.86  , 2.87  , Cur   , 0.01

> > 6     , 0     , , 7     3.43  , 2.87  , New   , 0.56

> > 6     , 7     , , 7     3.44  , 2.87  , New   , 0.57

> > 9     , 0     , , 8     2.86  , 2.88  , Cur   , 0.02

> > 7     , 0     , , 8     3.41  , 2.89  , New   , 0.52

> > 10    , 0     , , 9     2.83  , 2.87  , Cur   , 0.04

> > 10    , 1     , , 9     2.82  , 2.87  , Cur   , 0.05

> > 8     , 0     , , 9     3.4   , 2.89  , New   , 0.51

> > 8     , 1     , , 9     3.41  , 2.87  , New   , 0.54

> > 11    , 0     , , 10    2.83  , 2.88  , Cur   , 0.05

> > 11    , 2     , , 10    2.84  , 2.88  , Cur   , 0.04

> > 9     , 0     , , 10    3.41  , 2.87  , New   , 0.54

> > 9     , 2     , , 10    3.41  , 2.88  , New   , 0.53

> > 12    , 0     , , 11    2.83  , 2.89  , Cur   , 0.06

> > 12    , 3     , , 11    2.85  , 2.87  , Cur   , 0.02

> > 10    , 0     , , 11    3.41  , 2.87  , New   , 0.54

> > 10    , 3     , , 11    3.42  , 2.88  , New   , 0.54

> > 13    , 0     , , 12    2.86  , 2.87  , Cur   , 0.01

> > 13    , 4     , , 12    2.84  , 2.88  , Cur   , 0.04

> > 11    , 0     , , 12    3.43  , 2.87  , New   , 0.56

> > 11    , 4     , , 12    3.49  , 2.87  , New   , 0.62

> > 14    , 0     , , 13    2.85  , 2.86  , Cur   , 0.01

> > 14    , 5     , , 13    2.85  , 2.86  , Cur   , 0.01

> > 12    , 0     , , 13    3.41  , 2.86  , New   , 0.55

> > 12    , 5     , , 13    3.44  , 2.85  , New   , 0.59

> > 15    , 0     , , 14    2.83  , 2.87  , Cur   , 0.04

> > 15    , 6     , , 14    2.82  , 2.86  , Cur   , 0.04

> > 13    , 0     , , 14    3.41  , 2.86  , New   , 0.55

> > 13    , 6     , , 14    3.4   , 2.86  , New   , 0.54

> > 16    , 0     , , 15    2.84  , 2.86  , Cur   , 0.02

> > 16    , 7     , , 15    2.83  , 2.85  , Cur   , 0.02

> > 14    , 0     , , 15    3.41  , 2.85  , New   , 0.56

> > 14    , 7     , , 15    3.39  , 2.87  , New   , 0.52

> > 17    , 0     , , 16    2.83  , 2.87  , Cur   , 0.04

> > 15    , 0     , , 16    3.4   , 2.85  , New   , 0.55

> > 18    , 0     , , 17    2.83  , 2.86  , Cur   , 0.03

> > 18    , 1     , , 17    2.85  , 2.84  , New   , 0.01

> > 16    , 0     , , 17    3.41  , 2.85  , New   , 0.56

> > 16    , 1     , , 17    3.4   , 2.86  , New   , 0.54

> > 19    , 0     , , 18    2.8   , 2.84  , Cur   , 0.04

> > 19    , 2     , , 18    2.82  , 2.83  , Cur   , 0.01

> > 17    , 0     , , 18    3.39  , 2.86  , New   , 0.53

> > 17    , 2     , , 18    3.39  , 2.84  , New   , 0.55

> > 20    , 0     , , 19    2.85  , 2.87  , Cur   , 0.02

> > 20    , 3     , , 19    2.88  , 2.87  , New   , 0.01

> > 18    , 0     , , 19    3.38  , 2.85  , New   , 0.53

> > 18    , 3     , , 19    3.4   , 2.85  , New   , 0.55

> > 21    , 0     , , 20    2.83  , 2.85  , Cur   , 0.02

> > 21    , 4     , , 20    2.88  , 2.85  , New   , 0.03

> > 19    , 0     , , 20    3.39  , 2.84  , New   , 0.55

> > 19    , 4     , , 20    3.39  , 2.96  , New   , 0.43

> > 22    , 0     , , 21    2.84  , 2.9   , Cur   , 0.06

> > 22    , 5     , , 21    2.81  , 2.84  , Cur   , 0.03

> > 20    , 0     , , 21    3.41  , 2.81  , New   , 0.6

> > 20    , 5     , , 21    3.38  , 2.83  , New   , 0.55

> > 23    , 0     , , 22    2.8   , 2.82  , Cur   , 0.02

> > 23    , 6     , , 22    2.81  , 2.83  , Cur   , 0.02

> > 21    , 0     , , 22    3.35  , 2.81  , New   , 0.54

> > 21    , 6     , , 22    3.34  , 2.81  , New   , 0.53

> > 24    , 0     , , 23    2.77  , 2.84  , Cur   , 0.07

> > 24    , 7     , , 23    2.78  , 2.8   , Cur   , 0.02

> > 22    , 0     , , 23    3.34  , 2.79  , New   , 0.55

> > 22    , 7     , , 23    3.32  , 2.79  , New   , 0.53

> > 25    , 0     , , 24    2.77  , 2.8   , Cur   , 0.03

> > 23    , 0     , , 24    3.29  , 2.79  , New   , 0.5

> > 26    , 0     , , 25    2.73  , 2.78  , Cur   , 0.05

> > 26    , 1     , , 25    2.75  , 2.79  , Cur   , 0.04

> > 24    , 0     , , 25    3.27  , 2.79  , New   , 0.48

> > 24    , 1     , , 25    3.27  , 2.77  , New   , 0.5

> > 27    , 0     , , 26    2.72  , 2.78  , Cur   , 0.06

> > 27    , 2     , , 26    2.75  , 2.76  , Cur   , 0.01

> > 25    , 0     , , 26    3.29  , 2.73  , New   , 0.56

> > 25    , 2     , , 26    3.3   , 2.76  , New   , 0.54

> > 28    , 0     , , 27    2.75  , 2.79  , Cur   , 0.04

> > 28    , 3     , , 27    2.77  , 2.77  , Eq    , 0.0

> > 26    , 0     , , 27    3.28  , 2.78  , New   , 0.5

> > 26    , 3     , , 27    3.29  , 2.78  , New   , 0.51

> > 29    , 0     , , 28    2.74  , 2.76  , Cur   , 0.02

> > 29    , 4     , , 28    2.74  , 2.77  , Cur   , 0.03

> > 27    , 0     , , 28    3.3   , 2.76  , New   , 0.54

> > 27    , 4     , , 28    3.3   , 2.74  , New   , 0.56

> > 30    , 0     , , 29    2.72  , 2.76  , Cur   , 0.04

> > 30    , 5     , , 29    2.74  , 2.75  , Cur   , 0.01

> > 28    , 0     , , 29    3.25  , 2.73  , New   , 0.52

> > 28    , 5     , , 29    3.3   , 2.73  , New   , 0.57

> > 31    , 0     , , 30    2.73  , 2.77  , Cur   , 0.04

> > 31    , 6     , , 30    2.74  , 2.76  , Cur   , 0.02

> > 29    , 0     , , 30    3.25  , 2.73  , New   , 0.52

> > 29    , 6     , , 30    3.26  , 2.74  , New   , 0.52

> > 32    , 0     , , 31    2.73  , 2.74  , Cur   , 0.01

> > 32    , 7     , , 31    2.73  , 2.75  , Cur   , 0.02

> > 30    , 0     , , 31    3.24  , 2.72  , New   , 0.52

> > 30    , 7     , , 31    3.24  , 2.72  , New   , 0.52

> >

> > For memchr-avx2 the improvements are more modest though again near

> > universal. The improvement is most significant for medium sizes and

> > small sizes with pos > size. For small sizes with pos < size and large

> > sizes the two implementations perform roughly the same.

> >

> > Results For Tigerlake memchr-avx2

> > size  , algn  , Pos   , Cur T , New T , Win   , Dif

> > 2048  , 0     , , 32    6.15  , 6.27  , Cur   , 0.12

> > 256   , 1     , , 64    6.21  , 6.03  , New   , 0.18

> > 2048  , 0     , , 64    6.07  , 5.95  , New   , 0.12

> > 256   , 2     , , 64    6.01  , 5.8   , New   , 0.21

> > 2048  , 0     , , 128   7.05  , 6.55  , New   , 0.5

> > 256   , 3     , , 64    6.14  , 5.83  , New   , 0.31

> > 2048  , 0     , , 256   11.78 , 11.78 , Eq    , 0.0

> > 256   , 4     , , 64    6.1   , 5.85  , New   , 0.25

> > 2048  , 0     , , 512   16.32 , 15.96 , New   , 0.36

> > 256   , 5     , , 64    6.1   , 5.77  , New   , 0.33

> > 2048  , 0     , , 1024  25.38 , 25.18 , New   , 0.2

> > 256   , 6     , , 64    6.08  , 5.88  , New   , 0.2

> > 2048  , 0     , , 2048  38.56 , 38.32 , New   , 0.24

> > 256   , 7     , , 64    5.93  , 5.68  , New   , 0.25

> > 192   , 1     , , 32    5.49  , 5.3   , New   , 0.19

> > 256   , 1     , , 32    5.5   , 5.28  , New   , 0.22

> > 512   , 1     , , 32    5.48  , 5.32  , New   , 0.16

> > 192   , 2     , , 64    6.1   , 5.73  , New   , 0.37

> > 512   , 2     , , 64    5.88  , 5.72  , New   , 0.16

> > 192   , 3     , , 96    6.31  , 5.93  , New   , 0.38

> > 256   , 3     , , 96    6.32  , 5.93  , New   , 0.39

> > 512   , 3     , , 96    6.2   , 5.94  , New   , 0.26

> > 192   , 4     , , 128   6.65  , 6.4   , New   , 0.25

> > 256   , 4     , , 128   6.6   , 6.37  , New   , 0.23

> > 512   , 4     , , 128   6.74  , 6.33  , New   , 0.41

> > 192   , 5     , , 160   7.78  , 7.4   , New   , 0.38

> > 256   , 5     , , 160   7.18  , 7.4   , Cur   , 0.22

> > 512   , 5     , , 160   9.81  , 9.44  , New   , 0.37

> > 192   , 6     , , 192   9.12  , 7.77  , New   , 1.35

> > 256   , 6     , , 192   7.97  , 7.66  , New   , 0.31

> > 512   , 6     , , 192   10.14 , 9.95  , New   , 0.19

> > 192   , 7     , , 224   8.96  , 7.78  , New   , 1.18

> > 256   , 7     , , 224   8.52  , 8.23  , New   , 0.29

> > 512   , 7     , , 224   10.33 , 9.98  , New   , 0.35

> > 2     , 0     , , 1     3.61  , 3.6   , New   , 0.01

> > 2     , 1     , , 1     3.6   , 3.6   , Eq    , 0.0

> > 0     , 0     , , 1     3.02  , 3.0   , New   , 0.02

> > 0     , 1     , , 1     3.0   , 3.0   , Eq    , 0.0

> > 3     , 0     , , 2     3.6   , 3.6   , Eq    , 0.0

> > 3     , 2     , , 2     3.61  , 3.6   , New   , 0.01

> > 1     , 0     , , 2     4.82  , 3.6   , New   , 1.22

> > 1     , 2     , , 2     4.81  , 3.6   , New   , 1.21

> > 4     , 0     , , 3     3.61  , 3.61  , Eq    , 0.0

> > 4     , 3     , , 3     3.62  , 3.61  , New   , 0.01

> > 2     , 0     , , 3     4.82  , 3.62  , New   , 1.2

> > 2     , 3     , , 3     4.83  , 3.63  , New   , 1.2

> > 5     , 0     , , 4     3.63  , 3.64  , Cur   , 0.01

> > 5     , 4     , , 4     3.63  , 3.62  , New   , 0.01

> > 3     , 0     , , 4     4.84  , 3.62  , New   , 1.22

> > 3     , 4     , , 4     4.84  , 3.64  , New   , 1.2

> > 6     , 0     , , 5     3.66  , 3.64  , New   , 0.02

> > 6     , 5     , , 5     3.65  , 3.62  , New   , 0.03

> > 4     , 0     , , 5     4.83  , 3.63  , New   , 1.2

> > 4     , 5     , , 5     4.85  , 3.64  , New   , 1.21

> > 7     , 0     , , 6     3.76  , 3.79  , Cur   , 0.03

> > 7     , 6     , , 6     3.76  , 3.72  , New   , 0.04

> > 5     , 0     , , 6     4.84  , 3.62  , New   , 1.22

> > 5     , 6     , , 6     4.85  , 3.64  , New   , 1.21

> > 8     , 0     , , 7     3.64  , 3.65  , Cur   , 0.01

> > 8     , 7     , , 7     3.65  , 3.65  , Eq    , 0.0

> > 6     , 0     , , 7     4.88  , 3.64  , New   , 1.24

> > 6     , 7     , , 7     4.87  , 3.65  , New   , 1.22

> > 9     , 0     , , 8     3.66  , 3.66  , Eq    , 0.0

> > 7     , 0     , , 8     4.89  , 3.66  , New   , 1.23

> > 10    , 0     , , 9     3.67  , 3.67  , Eq    , 0.0

> > 10    , 1     , , 9     3.67  , 3.67  , Eq    , 0.0

> > 8     , 0     , , 9     4.9   , 3.67  , New   , 1.23

> > 8     , 1     , , 9     4.9   , 3.67  , New   , 1.23

> > 11    , 0     , , 10    3.68  , 3.67  , New   , 0.01

> > 11    , 2     , , 10    3.69  , 3.67  , New   , 0.02

> > 9     , 0     , , 10    4.9   , 3.67  , New   , 1.23

> > 9     , 2     , , 10    4.9   , 3.67  , New   , 1.23

> > 12    , 0     , , 11    3.71  , 3.68  , New   , 0.03

> > 12    , 3     , , 11    3.71  , 3.67  , New   , 0.04

> > 10    , 0     , , 11    4.9   , 3.67  , New   , 1.23

> > 10    , 3     , , 11    4.9   , 3.67  , New   , 1.23

> > 13    , 0     , , 12    4.24  , 4.23  , New   , 0.01

> > 13    , 4     , , 12    4.23  , 4.23  , Eq    , 0.0

> > 11    , 0     , , 12    4.9   , 3.7   , New   , 1.2

> > 11    , 4     , , 12    4.9   , 3.73  , New   , 1.17

> > 14    , 0     , , 13    3.99  , 4.01  , Cur   , 0.02

> > 14    , 5     , , 13    3.98  , 3.98  , Eq    , 0.0

> > 12    , 0     , , 13    4.9   , 3.69  , New   , 1.21

> > 12    , 5     , , 13    4.9   , 3.69  , New   , 1.21

> > 15    , 0     , , 14    3.99  , 3.97  , New   , 0.02

> > 15    , 6     , , 14    4.0   , 4.0   , Eq    , 0.0

> > 13    , 0     , , 14    4.9   , 3.67  , New   , 1.23

> > 13    , 6     , , 14    4.9   , 3.67  , New   , 1.23

> > 16    , 0     , , 15    3.99  , 4.02  , Cur   , 0.03

> > 16    , 7     , , 15    4.01  , 3.96  , New   , 0.05

> > 14    , 0     , , 15    4.93  , 3.67  , New   , 1.26

> > 14    , 7     , , 15    4.92  , 3.67  , New   , 1.25

> > 17    , 0     , , 16    4.04  , 3.99  , New   , 0.05

> > 15    , 0     , , 16    5.42  , 4.22  , New   , 1.2

> > 18    , 0     , , 17    4.01  , 3.97  , New   , 0.04

> > 18    , 1     , , 17    3.99  , 3.98  , New   , 0.01

> > 16    , 0     , , 17    5.22  , 3.98  , New   , 1.24

> > 16    , 1     , , 17    5.19  , 3.98  , New   , 1.21

> > 19    , 0     , , 18    4.0   , 3.99  , New   , 0.01

> > 19    , 2     , , 18    4.03  , 3.97  , New   , 0.06

> > 17    , 0     , , 18    5.18  , 3.99  , New   , 1.19

> > 17    , 2     , , 18    5.18  , 3.98  , New   , 1.2

> > 20    , 0     , , 19    4.02  , 3.98  , New   , 0.04

> > 20    , 3     , , 19    4.0   , 3.98  , New   , 0.02

> > 18    , 0     , , 19    5.19  , 3.97  , New   , 1.22

> > 18    , 3     , , 19    5.21  , 3.98  , New   , 1.23

> > 21    , 0     , , 20    3.98  , 4.0   , Cur   , 0.02

> > 21    , 4     , , 20    4.0   , 4.0   , Eq    , 0.0

> > 19    , 0     , , 20    5.19  , 3.99  , New   , 1.2

> > 19    , 4     , , 20    5.17  , 3.99  , New   , 1.18

> > 22    , 0     , , 21    4.03  , 3.98  , New   , 0.05

> > 22    , 5     , , 21    4.01  , 3.95  , New   , 0.06

> > 20    , 0     , , 21    5.19  , 4.0   , New   , 1.19

> > 20    , 5     , , 21    5.21  , 3.99  , New   , 1.22

> > 23    , 0     , , 22    4.06  , 3.97  , New   , 0.09

> > 23    , 6     , , 22    4.02  , 3.98  , New   , 0.04

> > 21    , 0     , , 22    5.2   , 4.02  , New   , 1.18

> > 21    , 6     , , 22    5.22  , 4.0   , New   , 1.22

> > 24    , 0     , , 23    4.15  , 3.98  , New   , 0.17

> > 24    , 7     , , 23    4.0   , 4.01  , Cur   , 0.01

> > 22    , 0     , , 23    5.28  , 4.0   , New   , 1.28

> > 22    , 7     , , 23    5.22  , 3.99  , New   , 1.23

> > 25    , 0     , , 24    4.1   , 4.04  , New   , 0.06

> > 23    , 0     , , 24    5.23  , 4.04  , New   , 1.19

> > 26    , 0     , , 25    4.1   , 4.06  , New   , 0.04

> > 26    , 1     , , 25    4.07  , 3.99  , New   , 0.08

> > 24    , 0     , , 25    5.26  , 4.02  , New   , 1.24

> > 24    , 1     , , 25    5.21  , 4.0   , New   , 1.21

> > 27    , 0     , , 26    4.17  , 4.03  , New   , 0.14

> > 27    , 2     , , 26    4.09  , 4.03  , New   , 0.06

> > 25    , 0     , , 26    5.29  , 4.1   , New   , 1.19

> > 25    , 2     , , 26    5.25  , 4.0   , New   , 1.25

> > 28    , 0     , , 27    4.06  , 4.1   , Cur   , 0.04

> > 28    , 3     , , 27    4.09  , 4.04  , New   , 0.05

> > 26    , 0     , , 27    5.26  , 4.04  , New   , 1.22

> > 26    , 3     , , 27    5.28  , 4.01  , New   , 1.27

> > 29    , 0     , , 28    4.07  , 4.02  , New   , 0.05

> > 29    , 4     , , 28    4.07  , 4.05  , New   , 0.02

> > 27    , 0     , , 28    5.25  , 4.02  , New   , 1.23

> > 27    , 4     , , 28    5.25  , 4.03  , New   , 1.22

> > 30    , 0     , , 29    4.14  , 4.06  , New   , 0.08

> > 30    , 5     , , 29    4.08  , 4.04  , New   , 0.04

> > 28    , 0     , , 29    5.26  , 4.07  , New   , 1.19

> > 28    , 5     , , 29    5.28  , 4.04  , New   , 1.24

> > 31    , 0     , , 30    4.09  , 4.08  , New   , 0.01

> > 31    , 6     , , 30    4.1   , 4.08  , New   , 0.02

> > 29    , 0     , , 30    5.28  , 4.05  , New   , 1.23

> > 29    , 6     , , 30    5.24  , 4.07  , New   , 1.17

> > 32    , 0     , , 31    4.1   , 4.13  , Cur   , 0.03

> > 32    , 7     , , 31    4.16  , 4.09  , New   , 0.07

> > 30    , 0     , , 31    5.31  , 4.09  , New   , 1.22

> > 30    , 7     , , 31    5.28  , 4.08  , New   , 1.2

> >

> > Results For Icelake memchr-avx2

> > size  , algn  , Pos   , Cur T , New T , Win   , Dif

> > 2048  , 0     , , 32    5.74  , 5.08  , New   , 0.66

> > 256   , 1     , , 64    5.16  , 4.93  , New   , 0.23

> > 2048  , 0     , , 64    4.86  , 4.69  , New   , 0.17

> > 256   , 2     , , 64    4.78  , 4.7   , New   , 0.08

> > 2048  , 0     , , 128   5.64  , 5.0   , New   , 0.64

> > 256   , 3     , , 64    4.64  , 4.59  , New   , 0.05

> > 2048  , 0     , , 256   9.07  , 9.17  , Cur   , 0.1

> > 256   , 4     , , 64    4.7   , 4.6   , New   , 0.1

> > 2048  , 0     , , 512   12.56 , 12.33 , New   , 0.23

> > 256   , 5     , , 64    4.72  , 4.61  , New   , 0.11

> > 2048  , 0     , , 1024  19.36 , 19.49 , Cur   , 0.13

> > 256   , 6     , , 64    4.82  , 4.69  , New   , 0.13

> > 2048  , 0     , , 2048  29.99 , 30.53 , Cur   , 0.54

> > 256   , 7     , , 64    4.9   , 4.85  , New   , 0.05

> > 192   , 1     , , 32    4.89  , 4.45  , New   , 0.44

> > 256   , 1     , , 32    4.93  , 4.44  , New   , 0.49

> > 512   , 1     , , 32    4.97  , 4.45  , New   , 0.52

> > 192   , 2     , , 64    5.04  , 4.65  , New   , 0.39

> > 512   , 2     , , 64    4.75  , 4.66  , New   , 0.09

> > 192   , 3     , , 96    5.14  , 4.66  , New   , 0.48

> > 256   , 3     , , 96    5.12  , 4.66  , New   , 0.46

> > 512   , 3     , , 96    5.13  , 4.62  , New   , 0.51

> > 192   , 4     , , 128   5.65  , 4.95  , New   , 0.7

> > 256   , 4     , , 128   5.63  , 4.95  , New   , 0.68

> > 512   , 4     , , 128   5.68  , 4.96  , New   , 0.72

> > 192   , 5     , , 160   6.1   , 5.84  , New   , 0.26

> > 256   , 5     , , 160   5.58  , 5.84  , Cur   , 0.26

> > 512   , 5     , , 160   7.95  , 7.74  , New   , 0.21

> > 192   , 6     , , 192   7.07  , 6.23  , New   , 0.84

> > 256   , 6     , , 192   6.34  , 6.09  , New   , 0.25

> > 512   , 6     , , 192   8.17  , 8.13  , New   , 0.04

> > 192   , 7     , , 224   7.06  , 6.23  , New   , 0.83

> > 256   , 7     , , 224   6.76  , 6.65  , New   , 0.11

> > 512   , 7     , , 224   8.29  , 8.08  , New   , 0.21

> > 2     , 0     , , 1     3.0   , 3.04  , Cur   , 0.04

> > 2     , 1     , , 1     3.06  , 3.07  , Cur   , 0.01

> > 0     , 0     , , 1     2.57  , 2.59  , Cur   , 0.02

> > 0     , 1     , , 1     2.6   , 2.61  , Cur   , 0.01

> > 3     , 0     , , 2     3.15  , 3.17  , Cur   , 0.02

> > 3     , 2     , , 2     3.19  , 3.21  , Cur   , 0.02

> > 1     , 0     , , 2     4.32  , 3.25  , New   , 1.07

> > 1     , 2     , , 2     4.36  , 3.31  , New   , 1.05

> > 4     , 0     , , 3     3.5   , 3.52  , Cur   , 0.02

> > 4     , 3     , , 3     3.52  , 3.54  , Cur   , 0.02

> > 2     , 0     , , 3     4.51  , 3.43  , New   , 1.08

> > 2     , 3     , , 3     4.56  , 3.47  , New   , 1.09

> > 5     , 0     , , 4     3.61  , 3.65  , Cur   , 0.04

> > 5     , 4     , , 4     3.63  , 3.67  , Cur   , 0.04

> > 3     , 0     , , 4     4.64  , 3.51  , New   , 1.13

> > 3     , 4     , , 4     4.7   , 3.51  , New   , 1.19

> > 6     , 0     , , 5     3.66  , 3.68  , Cur   , 0.02

> > 6     , 5     , , 5     3.69  , 3.65  , New   , 0.04

> > 4     , 0     , , 5     4.7   , 3.49  , New   , 1.21

> > 4     , 5     , , 5     4.58  , 3.48  , New   , 1.1

> > 7     , 0     , , 6     3.6   , 3.65  , Cur   , 0.05

> > 7     , 6     , , 6     3.59  , 3.64  , Cur   , 0.05

> > 5     , 0     , , 6     4.74  , 3.65  , New   , 1.09

> > 5     , 6     , , 6     4.73  , 3.64  , New   , 1.09

> > 8     , 0     , , 7     3.6   , 3.61  , Cur   , 0.01

> > 8     , 7     , , 7     3.6   , 3.61  , Cur   , 0.01

> > 6     , 0     , , 7     4.73  , 3.6   , New   , 1.13

> > 6     , 7     , , 7     4.73  , 3.62  , New   , 1.11

> > 9     , 0     , , 8     3.59  , 3.62  , Cur   , 0.03

> > 7     , 0     , , 8     4.72  , 3.64  , New   , 1.08

> > 10    , 0     , , 9     3.57  , 3.62  , Cur   , 0.05

> > 10    , 1     , , 9     3.56  , 3.61  , Cur   , 0.05

> > 8     , 0     , , 9     4.69  , 3.63  , New   , 1.06

> > 8     , 1     , , 9     4.71  , 3.61  , New   , 1.1

> > 11    , 0     , , 10    3.58  , 3.62  , Cur   , 0.04

> > 11    , 2     , , 10    3.59  , 3.63  , Cur   , 0.04

> > 9     , 0     , , 10    4.72  , 3.61  , New   , 1.11

> > 9     , 2     , , 10    4.7   , 3.61  , New   , 1.09

> > 12    , 0     , , 11    3.58  , 3.63  , Cur   , 0.05

> > 12    , 3     , , 11    3.58  , 3.62  , Cur   , 0.04

> > 10    , 0     , , 11    4.7   , 3.6   , New   , 1.1

> > 10    , 3     , , 11    4.73  , 3.64  , New   , 1.09

> > 13    , 0     , , 12    3.6   , 3.6   , Eq    , 0.0

> > 13    , 4     , , 12    3.57  , 3.62  , Cur   , 0.05

> > 11    , 0     , , 12    4.73  , 3.62  , New   , 1.11

> > 11    , 4     , , 12    4.79  , 3.61  , New   , 1.18

> > 14    , 0     , , 13    3.61  , 3.62  , Cur   , 0.01

> > 14    , 5     , , 13    3.59  , 3.59  , Eq    , 0.0

> > 12    , 0     , , 13    4.7   , 3.61  , New   , 1.09

> > 12    , 5     , , 13    4.75  , 3.58  , New   , 1.17

> > 15    , 0     , , 14    3.58  , 3.62  , Cur   , 0.04

> > 15    , 6     , , 14    3.59  , 3.62  , Cur   , 0.03

> > 13    , 0     , , 14    4.68  , 3.6   , New   , 1.08

> > 13    , 6     , , 14    4.68  , 3.63  , New   , 1.05

> > 16    , 0     , , 15    3.57  , 3.6   , Cur   , 0.03

> > 16    , 7     , , 15    3.55  , 3.59  , Cur   , 0.04

> > 14    , 0     , , 15    4.69  , 3.61  , New   , 1.08

> > 14    , 7     , , 15    4.69  , 3.61  , New   , 1.08

> > 17    , 0     , , 16    3.56  , 3.61  , Cur   , 0.05

> > 15    , 0     , , 16    4.71  , 3.58  , New   , 1.13

> > 18    , 0     , , 17    3.57  , 3.65  , Cur   , 0.08

> > 18    , 1     , , 17    3.58  , 3.59  , Cur   , 0.01

> > 16    , 0     , , 17    4.7   , 3.58  , New   , 1.12

> > 16    , 1     , , 17    4.68  , 3.59  , New   , 1.09

> > 19    , 0     , , 18    3.51  , 3.58  , Cur   , 0.07

> > 19    , 2     , , 18    3.55  , 3.58  , Cur   , 0.03

> > 17    , 0     , , 18    4.69  , 3.61  , New   , 1.08

> > 17    , 2     , , 18    4.68  , 3.61  , New   , 1.07

> > 20    , 0     , , 19    3.57  , 3.6   , Cur   , 0.03

> > 20    , 3     , , 19    3.59  , 3.59  , Eq    , 0.0

> > 18    , 0     , , 19    4.68  , 3.59  , New   , 1.09

> > 18    , 3     , , 19    4.67  , 3.57  , New   , 1.1

> > 21    , 0     , , 20    3.61  , 3.58  , New   , 0.03

> > 21    , 4     , , 20    3.62  , 3.6   , New   , 0.02

> > 19    , 0     , , 20    4.74  , 3.57  , New   , 1.17

> > 19    , 4     , , 20    4.69  , 3.7   , New   , 0.99

> > 22    , 0     , , 21    3.57  , 3.64  , Cur   , 0.07

> > 22    , 5     , , 21    3.55  , 3.6   , Cur   , 0.05

> > 20    , 0     , , 21    4.72  , 3.55  , New   , 1.17

> > 20    , 5     , , 21    4.66  , 3.55  , New   , 1.11

> > 23    , 0     , , 22    3.56  , 3.56  , Eq    , 0.0

> > 23    , 6     , , 22    3.54  , 3.56  , Cur   , 0.02

> > 21    , 0     , , 22    4.65  , 3.53  , New   , 1.12

> > 21    , 6     , , 22    4.62  , 3.56  , New   , 1.06

> > 24    , 0     , , 23    3.5   , 3.54  , Cur   , 0.04

> > 24    , 7     , , 23    3.52  , 3.53  , Cur   , 0.01

> > 22    , 0     , , 23    4.61  , 3.51  , New   , 1.1

> > 22    , 7     , , 23    4.6   , 3.51  , New   , 1.09

> > 25    , 0     , , 24    3.5   , 3.53  , Cur   , 0.03

> > 23    , 0     , , 24    4.54  , 3.5   , New   , 1.04

> > 26    , 0     , , 25    3.47  , 3.49  , Cur   , 0.02

> > 26    , 1     , , 25    3.46  , 3.51  , Cur   , 0.05

> > 24    , 0     , , 25    4.53  , 3.51  , New   , 1.02

> > 24    , 1     , , 25    4.51  , 3.51  , New   , 1.0

> > 27    , 0     , , 26    3.44  , 3.51  , Cur   , 0.07

> > 27    , 2     , , 26    3.51  , 3.52  , Cur   , 0.01

> > 25    , 0     , , 26    4.56  , 3.46  , New   , 1.1

> > 25    , 2     , , 26    4.55  , 3.47  , New   , 1.08

> > 28    , 0     , , 27    3.47  , 3.5   , Cur   , 0.03

> > 28    , 3     , , 27    3.48  , 3.47  , New   , 0.01

> > 26    , 0     , , 27    4.52  , 3.44  , New   , 1.08

> > 26    , 3     , , 27    4.55  , 3.46  , New   , 1.09

> > 29    , 0     , , 28    3.45  , 3.49  , Cur   , 0.04

> > 29    , 4     , , 28    3.5   , 3.5   , Eq    , 0.0

> > 27    , 0     , , 28    4.56  , 3.49  , New   , 1.07

> > 27    , 4     , , 28    4.5   , 3.49  , New   , 1.01

> > 30    , 0     , , 29    3.44  , 3.48  , Cur   , 0.04

> > 30    , 5     , , 29    3.46  , 3.47  , Cur   , 0.01

> > 28    , 0     , , 29    4.49  , 3.43  , New   , 1.06

> > 28    , 5     , , 29    4.57  , 3.45  , New   , 1.12

> > 31    , 0     , , 30    3.48  , 3.48  , Eq    , 0.0

> > 31    , 6     , , 30    3.46  , 3.49  , Cur   , 0.03

> > 29    , 0     , , 30    4.49  , 3.44  , New   , 1.05

> > 29    , 6     , , 30    4.53  , 3.44  , New   , 1.09

> > 32    , 0     , , 31    3.44  , 3.45  , Cur   , 0.01

> > 32    , 7     , , 31    3.46  , 3.51  , Cur   , 0.05

> > 30    , 0     , , 31    4.48  , 3.42  , New   , 1.06

> > 30    , 7     , , 31    4.48  , 3.44  , New   , 1.04

> >

> >

> > Results For Skylake memchr-avx2

> > size  , algn  , Pos   , Cur T , New T , Win   , Dif

> > 2048  , 0     , , 32    6.61  , 5.4   , New   , 1.21

> > 256   , 1     , , 64    6.52  , 5.68  , New   , 0.84

> > 2048  , 0     , , 64    6.03  , 5.47  , New   , 0.56

> > 256   , 2     , , 64    6.07  , 5.42  , New   , 0.65

> > 2048  , 0     , , 128   7.01  , 5.83  , New   , 1.18

> > 256   , 3     , , 64    6.24  , 5.68  , New   , 0.56

> > 2048  , 0     , , 256   11.03 , 9.86  , New   , 1.17

> > 256   , 4     , , 64    6.17  , 5.49  , New   , 0.68

> > 2048  , 0     , , 512   14.11 , 13.41 , New   , 0.7

> > 256   , 5     , , 64    6.03  , 5.45  , New   , 0.58

> > 2048  , 0     , , 1024  19.82 , 19.92 , Cur   , 0.1

> > 256   , 6     , , 64    6.14  , 5.7   , New   , 0.44

> > 2048  , 0     , , 2048  30.9  , 30.59 , New   , 0.31

> > 256   , 7     , , 64    6.05  , 5.64  , New   , 0.41

> > 192   , 1     , , 32    5.6   , 4.89  , New   , 0.71

> > 256   , 1     , , 32    5.59  , 5.07  , New   , 0.52

> > 512   , 1     , , 32    5.58  , 4.93  , New   , 0.65

> > 192   , 2     , , 64    6.14  , 5.46  , New   , 0.68

> > 512   , 2     , , 64    5.95  , 5.38  , New   , 0.57

> > 192   , 3     , , 96    6.6   , 5.74  , New   , 0.86

> > 256   , 3     , , 96    6.48  , 5.37  , New   , 1.11

> > 512   , 3     , , 96    6.56  , 5.44  , New   , 1.12

> > 192   , 4     , , 128   7.04  , 6.02  , New   , 1.02

> > 256   , 4     , , 128   6.96  , 5.89  , New   , 1.07

> > 512   , 4     , , 128   6.97  , 5.99  , New   , 0.98

> > 192   , 5     , , 160   8.49  , 7.07  , New   , 1.42

> > 256   , 5     , , 160   8.1   , 6.96  , New   , 1.14

> > 512   , 5     , , 160   10.48 , 9.14  , New   , 1.34

> > 192   , 6     , , 192   8.46  , 8.52  , Cur   , 0.06

> > 256   , 6     , , 192   8.53  , 7.58  , New   , 0.95

> > 512   , 6     , , 192   10.88 , 9.06  , New   , 1.82

> > 192   , 7     , , 224   8.59  , 8.35  , New   , 0.24

> > 256   , 7     , , 224   8.86  , 7.91  , New   , 0.95

> > 512   , 7     , , 224   10.89 , 8.98  , New   , 1.91

> > 2     , 0     , , 1     4.28  , 3.62  , New   , 0.66

> > 2     , 1     , , 1     4.32  , 3.75  , New   , 0.57

> > 0     , 0     , , 1     3.76  , 3.24  , New   , 0.52

> > 0     , 1     , , 1     3.7   , 3.19  , New   , 0.51

> > 3     , 0     , , 2     4.16  , 3.67  , New   , 0.49

> > 3     , 2     , , 2     4.21  , 3.68  , New   , 0.53

> > 1     , 0     , , 2     4.25  , 3.74  , New   , 0.51

> > 1     , 2     , , 2     4.4   , 3.82  , New   , 0.58

> > 4     , 0     , , 3     4.43  , 3.88  , New   , 0.55

> > 4     , 3     , , 3     4.34  , 3.8   , New   , 0.54

> > 2     , 0     , , 3     4.33  , 3.79  , New   , 0.54

> > 2     , 3     , , 3     4.37  , 3.84  , New   , 0.53

> > 5     , 0     , , 4     4.45  , 3.87  , New   , 0.58

> > 5     , 4     , , 4     4.41  , 3.84  , New   , 0.57

> > 3     , 0     , , 4     4.34  , 3.83  , New   , 0.51

> > 3     , 4     , , 4     4.35  , 3.82  , New   , 0.53

> > 6     , 0     , , 5     4.41  , 3.88  , New   , 0.53

> > 6     , 5     , , 5     4.41  , 3.88  , New   , 0.53

> > 4     , 0     , , 5     4.35  , 3.84  , New   , 0.51

> > 4     , 5     , , 5     4.37  , 3.85  , New   , 0.52

> > 7     , 0     , , 6     4.4   , 3.84  , New   , 0.56

> > 7     , 6     , , 6     4.39  , 3.83  , New   , 0.56

> > 5     , 0     , , 6     4.37  , 3.85  , New   , 0.52

> > 5     , 6     , , 6     4.4   , 3.86  , New   , 0.54

> > 8     , 0     , , 7     4.39  , 3.88  , New   , 0.51

> > 8     , 7     , , 7     4.4   , 3.83  , New   , 0.57

> > 6     , 0     , , 7     4.39  , 3.85  , New   , 0.54

> > 6     , 7     , , 7     4.38  , 3.87  , New   , 0.51

> > 9     , 0     , , 8     4.47  , 3.96  , New   , 0.51

> > 7     , 0     , , 8     4.37  , 3.85  , New   , 0.52

> > 10    , 0     , , 9     4.61  , 4.08  , New   , 0.53

> > 10    , 1     , , 9     4.61  , 4.09  , New   , 0.52

> > 8     , 0     , , 9     4.37  , 3.85  , New   , 0.52

> > 8     , 1     , , 9     4.37  , 3.85  , New   , 0.52

> > 11    , 0     , , 10    4.68  , 4.06  , New   , 0.62

> > 11    , 2     , , 10    4.56  , 4.1   , New   , 0.46

> > 9     , 0     , , 10    4.36  , 3.83  , New   , 0.53

> > 9     , 2     , , 10    4.37  , 3.83  , New   , 0.54

> > 12    , 0     , , 11    4.62  , 4.05  , New   , 0.57

> > 12    , 3     , , 11    4.63  , 4.06  , New   , 0.57

> > 10    , 0     , , 11    4.38  , 3.86  , New   , 0.52

> > 10    , 3     , , 11    4.41  , 3.86  , New   , 0.55

> > 13    , 0     , , 12    4.57  , 4.08  , New   , 0.49

> > 13    , 4     , , 12    4.59  , 4.12  , New   , 0.47

> > 11    , 0     , , 12    4.45  , 4.0   , New   , 0.45

> > 11    , 4     , , 12    4.51  , 4.04  , New   , 0.47

> > 14    , 0     , , 13    4.64  , 4.16  , New   , 0.48

> > 14    , 5     , , 13    4.67  , 4.1   , New   , 0.57

> > 12    , 0     , , 13    4.58  , 4.08  , New   , 0.5

> > 12    , 5     , , 13    4.6   , 4.1   , New   , 0.5

> > 15    , 0     , , 14    4.61  , 4.05  , New   , 0.56

> > 15    , 6     , , 14    4.59  , 4.06  , New   , 0.53

> > 13    , 0     , , 14    4.57  , 4.06  , New   , 0.51

> > 13    , 6     , , 14    4.57  , 4.05  , New   , 0.52

> > 16    , 0     , , 15    4.62  , 4.05  , New   , 0.57

> > 16    , 7     , , 15    4.63  , 4.06  , New   , 0.57

> > 14    , 0     , , 15    4.61  , 4.06  , New   , 0.55

> > 14    , 7     , , 15    4.59  , 4.05  , New   , 0.54

> > 17    , 0     , , 16    4.58  , 4.08  , New   , 0.5

> > 15    , 0     , , 16    4.64  , 4.06  , New   , 0.58

> > 18    , 0     , , 17    4.56  , 4.17  , New   , 0.39

> > 18    , 1     , , 17    4.59  , 4.09  , New   , 0.5

> > 16    , 0     , , 17    4.59  , 4.07  , New   , 0.52

> > 16    , 1     , , 17    4.58  , 4.04  , New   , 0.54

> > 19    , 0     , , 18    4.61  , 4.05  , New   , 0.56

> > 19    , 2     , , 18    4.6   , 4.08  , New   , 0.52

> > 17    , 0     , , 18    4.64  , 4.11  , New   , 0.53

> > 17    , 2     , , 18    4.56  , 4.13  , New   , 0.43

> > 20    , 0     , , 19    4.77  , 4.3   , New   , 0.47

> > 20    , 3     , , 19    4.6   , 4.14  , New   , 0.46

> > 18    , 0     , , 19    4.72  , 4.02  , New   , 0.7

> > 18    , 3     , , 19    4.53  , 4.01  , New   , 0.52

> > 21    , 0     , , 20    4.66  , 4.26  , New   , 0.4

> > 21    , 4     , , 20    4.74  , 4.07  , New   , 0.67

> > 19    , 0     , , 20    4.62  , 4.12  , New   , 0.5

> > 19    , 4     , , 20    4.57  , 4.04  , New   , 0.53

> > 22    , 0     , , 21    4.61  , 4.13  , New   , 0.48

> > 22    , 5     , , 21    4.64  , 4.08  , New   , 0.56

> > 20    , 0     , , 21    4.49  , 4.01  , New   , 0.48

> > 20    , 5     , , 21    4.58  , 4.06  , New   , 0.52

> > 23    , 0     , , 22    4.62  , 4.13  , New   , 0.49

> > 23    , 6     , , 22    4.72  , 4.27  , New   , 0.45

> > 21    , 0     , , 22    4.65  , 3.97  , New   , 0.68

> > 21    , 6     , , 22    4.5   , 4.02  , New   , 0.48

> > 24    , 0     , , 23    4.78  , 4.07  , New   , 0.71

> > 24    , 7     , , 23    4.67  , 4.23  , New   , 0.44

> > 22    , 0     , , 23    4.49  , 3.99  , New   , 0.5

> > 22    , 7     , , 23    4.56  , 4.03  , New   , 0.53

> > 25    , 0     , , 24    4.6   , 4.15  , New   , 0.45

> > 23    , 0     , , 24    4.57  , 4.06  , New   , 0.51

> > 26    , 0     , , 25    4.54  , 4.14  , New   , 0.4

> > 26    , 1     , , 25    4.72  , 4.1   , New   , 0.62

> > 24    , 0     , , 25    4.52  , 4.13  , New   , 0.39

> > 24    , 1     , , 25    4.55  , 4.0   , New   , 0.55

> > 27    , 0     , , 26    4.51  , 4.06  , New   , 0.45

> > 27    , 2     , , 26    4.53  , 4.16  , New   , 0.37

> > 25    , 0     , , 26    4.59  , 4.09  , New   , 0.5

> > 25    , 2     , , 26    4.55  , 4.01  , New   , 0.54

> > 28    , 0     , , 27    4.59  , 3.99  , New   , 0.6

> > 28    , 3     , , 27    4.57  , 3.95  , New   , 0.62

> > 26    , 0     , , 27    4.55  , 4.15  , New   , 0.4

> > 26    , 3     , , 27    4.57  , 3.99  , New   , 0.58

> > 29    , 0     , , 28    4.41  , 4.03  , New   , 0.38

> > 29    , 4     , , 28    4.59  , 4.02  , New   , 0.57

> > 27    , 0     , , 28    4.63  , 4.08  , New   , 0.55

> > 27    , 4     , , 28    4.44  , 4.02  , New   , 0.42

> > 30    , 0     , , 29    4.53  , 3.93  , New   , 0.6

> > 30    , 5     , , 29    4.55  , 3.88  , New   , 0.67

> > 28    , 0     , , 29    4.49  , 3.9   , New   , 0.59

> > 28    , 5     , , 29    4.44  , 3.94  , New   , 0.5

> > 31    , 0     , , 30    4.41  , 3.85  , New   , 0.56

> > 31    , 6     , , 30    4.48  , 3.86  , New   , 0.62

> > 29    , 0     , , 30    4.55  , 3.94  , New   , 0.61

> > 29    , 6     , , 30    4.32  , 3.95  , New   , 0.37

> > 32    , 0     , , 31    4.36  , 3.91  , New   , 0.45

> > 32    , 7     , , 31    4.37  , 3.89  , New   , 0.48

> > 30    , 0     , , 31    4.65  , 3.9   , New   , 0.75

> > 30    , 7     , , 31    4.42  , 3.93  , New   , 0.49

> >

> >  sysdeps/x86_64/multiarch/memchr-evex.S | 580 +++++++++++++++----------

> >  1 file changed, 349 insertions(+), 231 deletions(-)

> >

> > diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S

> > index 6dd5d67b90..65c16ef8a4 100644

> > --- a/sysdeps/x86_64/multiarch/memchr-evex.S

> > +++ b/sysdeps/x86_64/multiarch/memchr-evex.S

> > @@ -26,14 +26,28 @@

> >

> >  # ifdef USE_AS_WMEMCHR

> >  #  define VPBROADCAST        vpbroadcastd

> > -#  define VPCMP              vpcmpd

> > -#  define SHIFT_REG  r8d

> > +#  define VPMINU     vpminud

> > +#  define VPCMP      vpcmpd

> > +#  define VPCMPEQ    vpcmpeqd

> > +#  define CHAR_SIZE  4

> >  # else

> >  #  define VPBROADCAST        vpbroadcastb

> > -#  define VPCMP              vpcmpb

> > -#  define SHIFT_REG  ecx

> > +#  define VPMINU     vpminub

> > +#  define VPCMP      vpcmpb

> > +#  define VPCMPEQ    vpcmpeqb

> > +#  define CHAR_SIZE  1

> >  # endif

> >

> > +# ifdef USE_AS_RAWMEMCHR

> > +#  define RAW_PTR_REG        rcx

> > +#  define ALGN_PTR_REG       rdi

> > +# else

> > +#  define RAW_PTR_REG        rdi

> > +#  define ALGN_PTR_REG       rcx

> > +# endif

> > +

> > +#define XZERO                xmm23

>

> Add a space before define.  Rename XZERO to XMMZERO.


Done.

>

> > +#define YZERO                ymm23

>

> Add a space before define.  Rename YZERO to YMMZERO.


Done.

>

> >  # define XMMMATCH    xmm16

> >  # define YMMMATCH    ymm16

> >  # define YMM1                ymm17

> > @@ -44,18 +58,16 @@

> >  # define YMM6                ymm22

> >

> >  # define VEC_SIZE 32

> > +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)

> > +# define PAGE_SIZE 4096

> >

> >       .section .text.evex,"ax",@progbits

> > -ENTRY (MEMCHR)

> > +ENTRY(MEMCHR)

>

> No need for this change.


Fixed.

>

> >  # ifndef USE_AS_RAWMEMCHR

> >       /* Check for zero length.  */

> >       test    %RDX_LP, %RDX_LP

> >       jz      L(zero)

> > -# endif

> > -     movl    %edi, %ecx

> > -# ifdef USE_AS_WMEMCHR

> > -     shl     $2, %RDX_LP

> > -# else

> > +

> >  #  ifdef __ILP32__

> >       /* Clear the upper 32 bits.  */

> >       movl    %edx, %edx

> > @@ -63,319 +75,425 @@ ENTRY (MEMCHR)

> >  # endif

> >       /* Broadcast CHAR to YMMMATCH.  */

> >       VPBROADCAST %esi, %YMMMATCH

> > -     /* Check if we may cross page boundary with one vector load.  */

> > -     andl    $(2 * VEC_SIZE - 1), %ecx

> > -     cmpl    $VEC_SIZE, %ecx

> > -     ja      L(cros_page_boundary)

> > +     /* Check if we may cross page boundary with one

> > +        vector load.  */

>

> Fit comments to 72 columns.


Fixed.

>

> > +     movl    %edi, %eax

> > +     andl    $(PAGE_SIZE - 1), %eax

> > +     cmpl    $(PAGE_SIZE - VEC_SIZE), %eax

> > +     ja      L(cross_page_boundary)

> >

> >       /* Check the first VEC_SIZE bytes.  */

> > -     VPCMP   $0, (%rdi), %YMMMATCH, %k1

> > -     kmovd   %k1, %eax

> > -     testl   %eax, %eax

> > -

> > +     VPCMP   $0, (%rdi), %YMMMATCH, %k0

> > +     kmovd   %k0, %eax

> >  # ifndef USE_AS_RAWMEMCHR

> > -     jnz     L(first_vec_x0_check)

> > -     /* Adjust length and check the end of data.  */

> > -     subq    $VEC_SIZE, %rdx

> > -     jbe     L(zero)

> > +     /* If length < CHAR_PER_VEC handle special.  */

> > +     cmpq    $CHAR_PER_VEC, %rdx

> > +     jbe     L(first_vec_x0)

> > +# endif

> > +     testl   %eax, %eax

> > +     jz      L(aligned_more)

> > +     tzcntl  %eax, %eax

> > +# ifdef USE_AS_WMEMCHR

> > +     /* NB: Multiply bytes by CHAR_SIZE to get the

> > +        wchar_t count.  */

>

> Fit comments to 72 columns.


Fixed.

>

> > +     leaq    (%rdi, %rax, CHAR_SIZE), %rax

> >  # else

> > -     jnz     L(first_vec_x0)

> > +     addq    %rdi, %rax

> >  # endif

> > -

> > -     /* Align data for aligned loads in the loop.  */

> > -     addq    $VEC_SIZE, %rdi

> > -     andl    $(VEC_SIZE - 1), %ecx

> > -     andq    $-VEC_SIZE, %rdi

> > +     ret

> >

> >  # ifndef USE_AS_RAWMEMCHR

> > -     /* Adjust length.  */

> > -     addq    %rcx, %rdx

> > -

> > -     subq    $(VEC_SIZE * 4), %rdx

> > -     jbe     L(last_4x_vec_or_less)

> > -# endif

> > -     jmp     L(more_4x_vec)

> > +L(zero):

> > +     xorl    %eax, %eax

> > +     ret

> >

> > +     .p2align 5

> > +L(first_vec_x0):

> > +     /* Check if first match was before length.  */

> > +     tzcntl  %eax, %eax

> > +     xorl    %ecx, %ecx

> > +     cmpl    %eax, %edx

> > +     leaq    (%rdi, %rax, CHAR_SIZE), %rax

> > +     cmovle  %rcx, %rax

> > +     ret

> > +# else

> > +     /* NB: first_vec_x0 is 17 bytes which will leave

> > +        cross_page_boundary (which is relatively cold) close

> > +        enough to ideal alignment. So only realign

> > +        L(cross_page_boundary) if rawmemchr.  */

>

> Fit comments to 72 columns.


Fixed.

>

> >       .p2align 4

> > -L(cros_page_boundary):

> > -     andl    $(VEC_SIZE - 1), %ecx

> > +# endif

> > +L(cross_page_boundary):

> > +     /* Save pointer before aligning as its original

> > +        value is necessary for computing the return address if byte is

> > +        found or adjusting length if it is not and this is

> > +        memchr.  */

>

> Fit comments to 72 columns.


Fixed.

>

> > +     movq    %rdi, %rcx

> > +     /* Align data to VEC_SIZE. ALGN_PTR_REG is rcx

> > +        for memchr and rdi for rawmemchr.  */

>

> Fit comments to 72 columns.


Fixed.

>

> > +     andq    $-VEC_SIZE, %ALGN_PTR_REG

> > +     VPCMP   $0, (%ALGN_PTR_REG), %YMMMATCH, %k0

> > +     kmovd   %k0, %r8d

> >  # ifdef USE_AS_WMEMCHR

> > -     /* NB: Divide shift count by 4 since each bit in K1 represent 4

> > -        bytes.  */

> > -     movl    %ecx, %SHIFT_REG

> > -     sarl    $2, %SHIFT_REG

> > +     /* NB: Divide shift count by 4 since each bit in

> > +        K0 represent 4 bytes.  */

> > +     sarl    $2, %eax

> > +# endif

> > +# ifndef USE_AS_RAWMEMCHR

> > +     movl    $(PAGE_SIZE / CHAR_SIZE), %esi

> > +     subl    %eax, %esi

> >  # endif

> > -     andq    $-VEC_SIZE, %rdi

> > -     VPCMP   $0, (%rdi), %YMMMATCH, %k1

> > -     kmovd   %k1, %eax

> > -     /* Remove the leading bytes.  */

> > -     sarxl   %SHIFT_REG, %eax, %eax

> > -     testl   %eax, %eax

> > -     jz      L(aligned_more)

> > -     tzcntl  %eax, %eax

> >  # ifdef USE_AS_WMEMCHR

> > -     /* NB: Multiply wchar_t count by 4 to get the number of bytes.  */

> > -     sall    $2, %eax

> > +     andl    $(CHAR_PER_VEC - 1), %eax

> >  # endif

> > +     /* Remove the leading bytes.  */

> > +     sarxl   %eax, %r8d, %eax

> >  # ifndef USE_AS_RAWMEMCHR

> >       /* Check the end of data.  */

> > -     cmpq    %rax, %rdx

> > -     jbe     L(zero)

> > +     cmpq    %rsi, %rdx

> > +     jbe     L(first_vec_x0)

> > +# endif

> > +     testl   %eax, %eax

> > +     jz      L(cross_page_continue)

> > +     tzcntl  %eax, %eax

> > +# ifdef USE_AS_WMEMCHR

> > +     /* NB: Multiply bytes by CHAR_SIZE to get the

> > +        wchar_t count.  */

> > +     leaq    (%RAW_PTR_REG, %rax, CHAR_SIZE), %rax

> > +# else

> > +     addq    %RAW_PTR_REG, %rax

> >  # endif

> > -     addq    %rdi, %rax

> > -     addq    %rcx, %rax

> >       ret

> >

> >       .p2align 4

> > -L(aligned_more):

> > -# ifndef USE_AS_RAWMEMCHR

> > -        /* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)"

> > -        instead of "(rdx + rcx) - VEC_SIZE" to void possible addition

> > -        overflow.  */

> > -     negq    %rcx

> > -     addq    $VEC_SIZE, %rcx

> > +L(first_vec_x1):

> > +     tzcntl  %eax, %eax

> > +     leaq    VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax

> > +     ret

> >

> > -     /* Check the end of data.  */

> > -     subq    %rcx, %rdx

> > -     jbe     L(zero)

> > -# endif

> > +     .p2align 4

> > +L(first_vec_x2):

> > +     tzcntl  %eax, %eax

> > +     leaq    (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax

> > +     ret

> >

> > -     addq    $VEC_SIZE, %rdi

> > +     .p2align 4

> > +L(first_vec_x3):

> > +     tzcntl  %eax, %eax

> > +     leaq    (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax

> > +     ret

> > +

> > +     .p2align 4

> > +L(first_vec_x4):

> > +     tzcntl  %eax, %eax

> > +     leaq    (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax

> > +     ret

> > +

> > +     .p2align 5

> > +L(aligned_more):

> > +     /* Check the first 4 * VEC_SIZE.  Only one

> > +        VEC_SIZE at a time since data is only aligned to

> > +        VEC_SIZE.  */

>

> Fit comments to 72 columns.


Fixed.

>

> >

> >  # ifndef USE_AS_RAWMEMCHR

> > -     subq    $(VEC_SIZE * 4), %rdx

> > +     /* Align data to VEC_SIZE.  */

> > +L(cross_page_continue):

> > +     xorl    %ecx, %ecx

> > +     subl    %edi, %ecx

> > +     andq    $-VEC_SIZE, %rdi

> > +     /* esi is for adjusting length to see if near the

> > +        end.  */

>

> Fit comments to 72 columns.


Fixed.

>

> > +     leal    (VEC_SIZE * 5)(%rdi, %rcx), %esi

> > +#  ifdef USE_AS_WMEMCHR

> > +     /* NB: Divide bytes by 4 to get the wchar_t

> > +        count.  */

> > +     sarl    $2, %esi

> > +#  endif

> > +# else

> > +     andq    $-VEC_SIZE, %rdi

> > +L(cross_page_continue):

> > +# endif

> > +     /* Load first VEC regardless.  */

> > +     VPCMP   $0, (VEC_SIZE)(%rdi), %YMMMATCH, %k0

> > +     kmovd   %k0, %eax

> > +# ifndef USE_AS_RAWMEMCHR

> > +     /* Adjust length. If near end handle specially.

> > +      */

>

> Fit comments to 72 columns.


Fixed.

>

> > +     subq    %rsi, %rdx

> >       jbe     L(last_4x_vec_or_less)

> >  # endif

> > -

> > -L(more_4x_vec):

> > -     /* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time

> > -        since data is only aligned to VEC_SIZE.  */

> > -     VPCMP   $0, (%rdi), %YMMMATCH, %k1

> > -     kmovd   %k1, %eax

> > -     testl   %eax, %eax

> > -     jnz     L(first_vec_x0)

> > -

> > -     VPCMP   $0, VEC_SIZE(%rdi), %YMMMATCH, %k1

> > -     kmovd   %k1, %eax

> >       testl   %eax, %eax

> >       jnz     L(first_vec_x1)

> >

> > -     VPCMP   $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1

> > -     kmovd   %k1, %eax

> > +     VPCMP   $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0

> > +     kmovd   %k0, %eax

> >       testl   %eax, %eax

> >       jnz     L(first_vec_x2)

> >

> > -     VPCMP   $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1

> > -     kmovd   %k1, %eax

> > +     VPCMP   $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0

> > +     kmovd   %k0, %eax

> >       testl   %eax, %eax

> >       jnz     L(first_vec_x3)

> >

> > -     addq    $(VEC_SIZE * 4), %rdi

> > +     VPCMP   $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0

> > +     kmovd   %k0, %eax

> > +     testl   %eax, %eax

> > +     jnz     L(first_vec_x4)

> > +

> >

> >  # ifndef USE_AS_RAWMEMCHR

> > -     subq    $(VEC_SIZE * 4), %rdx

> > -     jbe     L(last_4x_vec_or_less)

> > -# endif

> > +     /* Check if at last CHAR_PER_VEC * 4 length.  */

> > +     subq    $(CHAR_PER_VEC * 4), %rdx

> > +     jbe     L(last_4x_vec_or_less_cmpeq)

> > +     addq    $VEC_SIZE, %rdi

> >

> > -     /* Align data to 4 * VEC_SIZE.  */

> > -     movq    %rdi, %rcx

> > -     andl    $(4 * VEC_SIZE - 1), %ecx

> > +     /* Align data to VEC_SIZE * 4 for the loop and

> > +        readjust length.  */

>

> Fit comments to 72 columns.


Fixed.

>

> > +#  ifdef USE_AS_WMEMCHR

> > +     movl    %edi, %ecx

> >       andq    $-(4 * VEC_SIZE), %rdi

> > -

> > -# ifndef USE_AS_RAWMEMCHR

> > -     /* Adjust length.  */

> > +     andl    $(VEC_SIZE * 4 - 1), %ecx

> > +     /* NB: Divide bytes by 4 to get the wchar_t

> > +        count.  */

>

> Fit comments to 72 columns.


Fixed.

>

> > +     sarl    $2, %ecx

> >       addq    %rcx, %rdx

> > +#  else

> > +     addq    %rdi, %rdx

> > +     andq    $-(4 * VEC_SIZE), %rdi

> > +     subq    %rdi, %rdx

> > +#  endif

> > +# else

> > +     addq    $VEC_SIZE, %rdi

> > +     andq    $-(4 * VEC_SIZE), %rdi

> >  # endif

> >

> > +     vpxorq  %XZERO, %XZERO, %XZERO

> > +

> > +     /* Compare 4 * VEC at a time forward.  */

> >       .p2align 4

> >  L(loop_4x_vec):

> > -     /* Compare 4 * VEC at a time forward.  */

> > -     VPCMP   $0, (%rdi), %YMMMATCH, %k1

> > -     VPCMP   $0, VEC_SIZE(%rdi), %YMMMATCH, %k2

> > -     kord    %k1, %k2, %k5

> > -     VPCMP   $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k3

> > -     VPCMP   $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k4

> > -

> > -     kord    %k3, %k4, %k6

> > -     kortestd %k5, %k6

> > -     jnz     L(4x_vec_end)

> > -

> > -     addq    $(VEC_SIZE * 4), %rdi

> > -

> > +     /* It would be possible to save some instructions

> > +        using 4x VPCMP but bottleneck on port 5 makes it not worth

> > +        it.  */

>

> Fit comments to 72 columns.


Fixed.

>

> > +     VPCMP   $4, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k1

> > +     /* xor will set bytes matching esi to zero.  */

> > +     vpxorq  (VEC_SIZE * 5)(%rdi), %YMMMATCH, %YMM2

> > +     vpxorq  (VEC_SIZE * 6)(%rdi), %YMMMATCH, %YMM3

> > +     VPCMP   $0, (VEC_SIZE * 7)(%rdi), %YMMMATCH, %k3

> > +     /* Reduce VEC2 / VEC3 with min and VEC1 with zero

> > +        mask.  */

>

> Fit comments to 72 columns.


Fixed.

>

> > +     VPMINU  %YMM2, %YMM3, %YMM3 {%k1} {z}

> > +     VPCMP   $0, %YMM3, %YZERO, %k2

> >  # ifdef USE_AS_RAWMEMCHR

> > -     jmp     L(loop_4x_vec)

> > +     subq    $-(VEC_SIZE * 4), %rdi

> > +     kortestd %k2, %k3

> > +     jz      L(loop_4x_vec)

> >  # else

> > -     subq    $(VEC_SIZE * 4), %rdx

> > -     ja      L(loop_4x_vec)

> > +     kortestd %k2, %k3

> > +     jnz     L(loop_4x_vec_end)

> >

> > -L(last_4x_vec_or_less):

> > -     /* Less than 4 * VEC and aligned to VEC_SIZE.  */

> > -     addl    $(VEC_SIZE * 2), %edx

> > -     jle     L(last_2x_vec)

> > +     subq    $-(VEC_SIZE * 4), %rdi

> >

> > -     VPCMP   $0, (%rdi), %YMMMATCH, %k1

> > -     kmovd   %k1, %eax

> > -     testl   %eax, %eax

> > -     jnz     L(first_vec_x0)

> > +     subq    $(CHAR_PER_VEC * 4), %rdx

> > +     ja      L(loop_4x_vec)

> >

> > -     VPCMP   $0, VEC_SIZE(%rdi), %YMMMATCH, %k1

> > -     kmovd   %k1, %eax

> > +     /* Fall through into less than 4 remaining

> > +        vectors of length case.  */

>

> Fit comments to 72 columns.


Fixed.

>

> > +     VPCMP   $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0

> > +     kmovd   %k0, %eax

> > +     addq    $(VEC_SIZE * 3), %rdi

> > +     .p2align 4

> > +L(last_4x_vec_or_less):

> > +     /* Check if first VEC contained match.  */

> >       testl   %eax, %eax

> > -     jnz     L(first_vec_x1)

> > +     jnz     L(first_vec_x1_check)

> >

> > -     VPCMP   $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1

> > -     kmovd   %k1, %eax

> > -     testl   %eax, %eax

> > +     /* If remaining length > CHAR_PER_VEC * 2.  */

> > +     addl    $(CHAR_PER_VEC * 2), %edx

> > +     jg      L(last_4x_vec)

> >

> > -     jnz     L(first_vec_x2_check)

> > -     subl    $VEC_SIZE, %edx

> > -     jle     L(zero)

> > +L(last_2x_vec):

> > +     /* If remaining length < CHAR_PER_VEC.  */

> > +     addl    $CHAR_PER_VEC, %edx

> > +     jle     L(zero_end)

> > +

> > +     /* Check VEC2 and compare any match with

> > +        remaining length.  */

>

> Fit comments to 72 columns.


Fixed.

>

> > +     VPCMP   $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0

> > +     kmovd   %k0, %eax

> > +     tzcntl  %eax, %eax

> > +     cmpl    %eax, %edx

> > +     jbe     L(set_zero_end)

> > +     leaq    (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax

> > +L(zero_end):

> > +     ret

> >

> > -     VPCMP   $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1

> > -     kmovd   %k1, %eax

> > -     testl   %eax, %eax

> >

> > -     jnz     L(first_vec_x3_check)

> > +     .p2align 4

> > +L(first_vec_x1_check):

> > +     tzcntl  %eax, %eax

> > +     /* Adjust length.  */

> > +     subl    $-(CHAR_PER_VEC * 4), %edx

> > +     /* Check if match within remaining length.  */

> > +     cmpl    %eax, %edx

> > +     jbe     L(set_zero_end)

> > +     /* NB: Multiply bytes by CHAR_SIZE to get the

> > +        wchar_t count.  */

>

> Fit comments to 72 columns.


Fixed.

>

> > +     leaq    VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax

> > +     ret

> > +L(set_zero_end):

> >       xorl    %eax, %eax

> >       ret

> >

> >       .p2align 4

> > -L(last_2x_vec):

> > -     addl    $(VEC_SIZE * 2), %edx

> > -     VPCMP   $0, (%rdi), %YMMMATCH, %k1

> > +L(loop_4x_vec_end):

> > +# endif

> > +     /* rawmemchr will fall through into this if match

> > +        was found in loop.  */

>

> Fit comments to 72 columns.


Fixed.

>

> > +

> > +     /* k1 has the NOT of the matches with VEC1.  */

> >       kmovd   %k1, %eax

> > -     testl   %eax, %eax

> > +# ifdef USE_AS_WMEMCHR

> > +     subl    $((1 << CHAR_PER_VEC) - 1), %eax

> > +# else

> > +     incl    %eax

> > +# endif

> > +     jnz     L(last_vec_x1_return)

> >

> > -     jnz     L(first_vec_x0_check)

> > -     subl    $VEC_SIZE, %edx

> > -     jle     L(zero)

> > +     VPCMP   $0, %YMM2, %YZERO, %k0

> > +     kmovd   %k0, %eax

> > +     testl   %eax, %eax

> > +     jnz     L(last_vec_x2_return)

> >

> > -     VPCMP   $0, VEC_SIZE(%rdi), %YMMMATCH, %k1

> > -     kmovd   %k1, %eax

> > +     kmovd   %k2, %eax

> >       testl   %eax, %eax

> > -     jnz     L(first_vec_x1_check)

> > -     xorl    %eax, %eax

> > -     ret

> > +     jnz     L(last_vec_x3_return)

> >

> > -     .p2align 4

> > -L(first_vec_x0_check):

> > +     kmovd   %k3, %eax

> >       tzcntl  %eax, %eax

> > -# ifdef USE_AS_WMEMCHR

> > -     /* NB: Multiply wchar_t count by 4 to get the number of bytes.  */

> > -     sall    $2, %eax

> > +# ifdef USE_AS_RAWMEMCHR

> > +     leaq    (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax

> > +# else

> > +     leaq    (VEC_SIZE * 7)(%rdi, %rax, CHAR_SIZE), %rax

> >  # endif

> > -     /* Check the end of data.  */

> > -     cmpq    %rax, %rdx

> > -     jbe     L(zero)

> > -     addq    %rdi, %rax

> >       ret

> >

> >       .p2align 4

> > -L(first_vec_x1_check):

> > +L(last_vec_x1_return):

> >       tzcntl  %eax, %eax

> > -# ifdef USE_AS_WMEMCHR

> > -     /* NB: Multiply wchar_t count by 4 to get the number of bytes.  */

> > -     sall    $2, %eax

> > -# endif

> > -     /* Check the end of data.  */

> > -     cmpq    %rax, %rdx

> > -     jbe     L(zero)

> > -     addq    $VEC_SIZE, %rax

> > +# ifdef USE_AS_RAWMEMCHR

> > +#  ifdef USE_AS_WMEMCHR

> > +     /* NB: Multiply bytes by CHAR_SIZE to get the

> > +        wchar_t count.  */

>

> Fit comments to 72 columns.

Fixed.
>

> > +     leaq    (%rdi, %rax, CHAR_SIZE), %rax

> > +#  else

> >       addq    %rdi, %rax

> > -     ret

> > -

> > -     .p2align 4

> > -L(first_vec_x2_check):

> > -     tzcntl  %eax, %eax

> > -# ifdef USE_AS_WMEMCHR

> > -     /* NB: Multiply wchar_t count by 4 to get the number of bytes.  */

> > -     sall    $2, %eax

> > +#  endif

> > +# else

> > +     /* NB: Multiply bytes by CHAR_SIZE to get the

> > +        wchar_t count.  */

>

> Fit comments to 72 columns.

Fixed.
>

> > +     leaq    (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax

> >  # endif

> > -     /* Check the end of data.  */

> > -     cmpq    %rax, %rdx

> > -     jbe     L(zero)

> > -     addq    $(VEC_SIZE * 2), %rax

> > -     addq    %rdi, %rax

> >       ret

> >

> >       .p2align 4

> > -L(first_vec_x3_check):

> > +L(last_vec_x2_return):

> >       tzcntl  %eax, %eax

> > -# ifdef USE_AS_WMEMCHR

> > -     /* NB: Multiply wchar_t count by 4 to get the number of bytes.  */

> > -     sall    $2, %eax

> > +# ifdef USE_AS_RAWMEMCHR

> > +     /* NB: Multiply bytes by CHAR_SIZE to get the

> > +        wchar_t count.  */

> > +     leaq    VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax

> > +# else

> > +     /* NB: Multiply bytes by CHAR_SIZE to get the

> > +        wchar_t count.  */

> > +     leaq    (VEC_SIZE * 5)(%rdi, %rax, CHAR_SIZE), %rax

> >  # endif

> > -     /* Check the end of data.  */

> > -     cmpq    %rax, %rdx

> > -     jbe     L(zero)

> > -     addq    $(VEC_SIZE * 3), %rax

> > -     addq    %rdi, %rax

> >       ret

> >

> >       .p2align 4

> > -L(zero):

> > -     xorl    %eax, %eax

> > -     ret

> > -# endif

> > -

> > -     .p2align 4

> > -L(first_vec_x0):

> > +L(last_vec_x3_return):

> >       tzcntl  %eax, %eax

> > -# ifdef USE_AS_WMEMCHR

> > -     /* NB: Multiply wchar_t count by 4 to get the number of bytes.  */

> > -     leaq    (%rdi, %rax, 4), %rax

> > +# ifdef USE_AS_RAWMEMCHR

> > +     /* NB: Multiply bytes by CHAR_SIZE to get the

> > +        wchar_t count.  */

>

> Fit comments to 72 columns.

Fixed.
>

> > +     leaq    (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax

> >  # else

> > -     addq    %rdi, %rax

> > +     /* NB: Multiply bytes by CHAR_SIZE to get the

> > +        wchar_t count.  */

>

> Fit comments to 72 columns.

Fixed.
>

> > +     leaq    (VEC_SIZE * 6)(%rdi, %rax, CHAR_SIZE), %rax

> >  # endif

> >       ret

> >

> > +

> > +# ifndef USE_AS_RAWMEMCHR

> > +L(last_4x_vec_or_less_cmpeq):

> > +     VPCMP   $0, (VEC_SIZE * 5)(%rdi), %YMMMATCH, %k0

> > +     kmovd   %k0, %eax

> > +     subq    $-(VEC_SIZE * 4), %rdi

> > +     /* Check first VEC regardless.  */

> > +     testl   %eax, %eax

> > +     jnz     L(first_vec_x1_check)

> > +

> > +     /* If remaining length <= CHAR_PER_VEC * 2.  */

> > +     addl    $(CHAR_PER_VEC * 2), %edx

> > +     jle     L(last_2x_vec)

> > +

> >       .p2align 4

> > -L(first_vec_x1):

> > +L(last_4x_vec):

> > +     VPCMP   $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0

> > +     kmovd   %k0, %eax

> > +     testl   %eax, %eax

> > +     jnz     L(last_vec_x2)

> > +

> > +

> > +     VPCMP   $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0

> > +     kmovd   %k0, %eax

> > +     /* Create mask for possible matches within

> > +        remaining length.  */

>

> Fit comments to 72 columns.

Fixed.
>

> > +#  ifdef USE_AS_WMEMCHR

> > +     movl    $((1 << (CHAR_PER_VEC * 2)) - 1), %ecx

> > +     bzhil   %edx, %ecx, %ecx

> > +#  else

> > +     movq    $-1, %rcx

> > +     bzhiq   %rdx, %rcx, %rcx

> > +#  endif

> > +     /* Test matches in data against length match.  */

> > +     andl    %ecx, %eax

> > +     jnz     L(last_vec_x3)

> > +

> > +     /* if remaining length <= CHAR_PER_VEC * 3 (Note

> > +        this is after remaining length was found to be >

> > +        CHAR_PER_VEC * 2).  */

>

> Fit comments to 72 columns.

Fixed.
>

> > +     subl    $CHAR_PER_VEC, %edx

> > +     jbe     L(zero_end2)

> > +

> > +

> > +     VPCMP   $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0

> > +     kmovd   %k0, %eax

> > +     /* Shift remaining length mask for last VEC.  */

> > +#  ifdef USE_AS_WMEMCHR

> > +     shrl    $CHAR_PER_VEC, %ecx

> > +#  else

> > +     shrq    $CHAR_PER_VEC, %rcx

> > +#  endif

> > +     andl    %ecx, %eax

> > +     jz      L(zero_end2)

> >       tzcntl  %eax, %eax

> > -# ifdef USE_AS_WMEMCHR

> > -     /* NB: Multiply wchar_t count by 4 to get the number of bytes.  */

> > -     leaq    VEC_SIZE(%rdi, %rax, 4), %rax

> > -# else

> > -     addq    $VEC_SIZE, %rax

> > -     addq    %rdi, %rax

> > -# endif

> > +     leaq    (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax

> > +L(zero_end2):

> >       ret

> >

> > -     .p2align 4

> > -L(first_vec_x2):

> > +L(last_vec_x2):

> >       tzcntl  %eax, %eax

> > -# ifdef USE_AS_WMEMCHR

> > -     /* NB: Multiply wchar_t count by 4 to get the number of bytes.  */

> > -     leaq    (VEC_SIZE * 2)(%rdi, %rax, 4), %rax

> > -# else

> > -     addq    $(VEC_SIZE * 2), %rax

> > -     addq    %rdi, %rax

> > -# endif

> > +     leaq    (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax

> >       ret

> >

> >       .p2align 4

> > -L(4x_vec_end):

> > -     kmovd   %k1, %eax

> > -     testl   %eax, %eax

> > -     jnz     L(first_vec_x0)

> > -     kmovd   %k2, %eax

> > -     testl   %eax, %eax

> > -     jnz     L(first_vec_x1)

> > -     kmovd   %k3, %eax

> > -     testl   %eax, %eax

> > -     jnz     L(first_vec_x2)

> > -     kmovd   %k4, %eax

> > -     testl   %eax, %eax

> > -L(first_vec_x3):

> > +L(last_vec_x3):

> >       tzcntl  %eax, %eax

> > -# ifdef USE_AS_WMEMCHR

> > -     /* NB: Multiply wchar_t count by 4 to get the number of bytes.  */

> > -     leaq    (VEC_SIZE * 3)(%rdi, %rax, 4), %rax

> > -# else

> > -     addq    $(VEC_SIZE * 3), %rax

> > -     addq    %rdi, %rax

> > -# endif

> > +     leaq    (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax

> >       ret

> > +# endif

> >

> > -END (MEMCHR)

> > +END(MEMCHR)

>

> No need for this change.

Fixed.

>

> >  #endif

> > --

> > 2.29.2

> >

>

> Thanks.

>

> H.J.

Patch

diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S
index 6dd5d67b90..65c16ef8a4 100644
--- a/sysdeps/x86_64/multiarch/memchr-evex.S
+++ b/sysdeps/x86_64/multiarch/memchr-evex.S
@@ -26,14 +26,28 @@ 
 
 # ifdef USE_AS_WMEMCHR
 #  define VPBROADCAST	vpbroadcastd
-#  define VPCMP		vpcmpd
-#  define SHIFT_REG	r8d
+#  define VPMINU	vpminud
+#  define VPCMP	vpcmpd
+#  define VPCMPEQ	vpcmpeqd
+#  define CHAR_SIZE	4
 # else
 #  define VPBROADCAST	vpbroadcastb
-#  define VPCMP		vpcmpb
-#  define SHIFT_REG	ecx
+#  define VPMINU	vpminub
+#  define VPCMP	vpcmpb
+#  define VPCMPEQ	vpcmpeqb
+#  define CHAR_SIZE	1
 # endif
 
+# ifdef USE_AS_RAWMEMCHR
+#  define RAW_PTR_REG	rcx
+#  define ALGN_PTR_REG	rdi
+# else
+#  define RAW_PTR_REG	rdi
+#  define ALGN_PTR_REG	rcx
+# endif
+
+#define XZERO		xmm23
+#define YZERO		ymm23
 # define XMMMATCH	xmm16
 # define YMMMATCH	ymm16
 # define YMM1		ymm17
@@ -44,18 +58,16 @@ 
 # define YMM6		ymm22
 
 # define VEC_SIZE 32
+# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
+# define PAGE_SIZE 4096
 
 	.section .text.evex,"ax",@progbits
-ENTRY (MEMCHR)
+ENTRY(MEMCHR)
 # ifndef USE_AS_RAWMEMCHR
 	/* Check for zero length.  */
 	test	%RDX_LP, %RDX_LP
 	jz	L(zero)
-# endif
-	movl	%edi, %ecx
-# ifdef USE_AS_WMEMCHR
-	shl	$2, %RDX_LP
-# else
+
 #  ifdef __ILP32__
 	/* Clear the upper 32 bits.  */
 	movl	%edx, %edx
@@ -63,319 +75,425 @@  ENTRY (MEMCHR)
 # endif
 	/* Broadcast CHAR to YMMMATCH.  */
 	VPBROADCAST %esi, %YMMMATCH
-	/* Check if we may cross page boundary with one vector load.  */
-	andl	$(2 * VEC_SIZE - 1), %ecx
-	cmpl	$VEC_SIZE, %ecx
-	ja	L(cros_page_boundary)
+	/* Check if we may cross page boundary with one
+	   vector load.  */
+	movl	%edi, %eax
+	andl	$(PAGE_SIZE - 1), %eax
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	ja	L(cross_page_boundary)
 
 	/* Check the first VEC_SIZE bytes.  */
-	VPCMP	$0, (%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
-	testl	%eax, %eax
-
+	VPCMP	$0, (%rdi), %YMMMATCH, %k0
+	kmovd	%k0, %eax
 # ifndef USE_AS_RAWMEMCHR
-	jnz	L(first_vec_x0_check)
-	/* Adjust length and check the end of data.  */
-	subq	$VEC_SIZE, %rdx
-	jbe	L(zero)
+	/* If length <= CHAR_PER_VEC handle specially.  */
+	cmpq	$CHAR_PER_VEC, %rdx
+	jbe	L(first_vec_x0)
+# endif
+	testl	%eax, %eax
+	jz	L(aligned_more)
+	tzcntl	%eax, %eax
+# ifdef USE_AS_WMEMCHR
+	/* NB: Multiply wchar_t count by CHAR_SIZE to get the
+	   number of bytes.  */
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
 # else
-	jnz	L(first_vec_x0)
+	addq	%rdi, %rax
 # endif
-
-	/* Align data for aligned loads in the loop.  */
-	addq	$VEC_SIZE, %rdi
-	andl	$(VEC_SIZE - 1), %ecx
-	andq	$-VEC_SIZE, %rdi
+	ret
 
 # ifndef USE_AS_RAWMEMCHR
-	/* Adjust length.  */
-	addq	%rcx, %rdx
-
-	subq	$(VEC_SIZE * 4), %rdx
-	jbe	L(last_4x_vec_or_less)
-# endif
-	jmp	L(more_4x_vec)
+L(zero):
+	xorl	%eax, %eax
+	ret
 
+	.p2align 5
+L(first_vec_x0):
+	/* Check if first match was before length.  */
+	tzcntl	%eax, %eax
+	xorl	%ecx, %ecx
+	cmpl	%eax, %edx
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+	cmovle	%rcx, %rax
+	ret
+# else
+	/* NB: first_vec_x0 is 17 bytes which will leave
+	   cross_page_boundary (which is relatively cold) close
+	   enough to ideal alignment. So only realign
+	   L(cross_page_boundary) if rawmemchr.  */
 	.p2align 4
-L(cros_page_boundary):
-	andl	$(VEC_SIZE - 1), %ecx
+# endif
+L(cross_page_boundary):
+	/* Save pointer before aligning as its original value is
+	   necessary for computing the return address if byte is
+	   found or adjusting length if it is not and this is
+	   memchr.  */
+	movq	%rdi, %rcx
+	/* Align data to VEC_SIZE. ALGN_PTR_REG is rcx
+	   for memchr and rdi for rawmemchr.  */
+	andq	$-VEC_SIZE, %ALGN_PTR_REG
+	VPCMP	$0, (%ALGN_PTR_REG), %YMMMATCH, %k0
+	kmovd	%k0, %r8d
 # ifdef USE_AS_WMEMCHR
-	/* NB: Divide shift count by 4 since each bit in K1 represent 4
-	   bytes.  */
-	movl	%ecx, %SHIFT_REG
-	sarl	$2, %SHIFT_REG
+	/* NB: Divide shift count by 4 since each bit in
+	   K0 represents 4 bytes.  */
+	sarl	$2, %eax
+# endif
+# ifndef USE_AS_RAWMEMCHR
+	movl	$(PAGE_SIZE / CHAR_SIZE), %esi
+	subl	%eax, %esi
 # endif
-	andq	$-VEC_SIZE, %rdi
-	VPCMP	$0, (%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
-	/* Remove the leading bytes.  */
-	sarxl	%SHIFT_REG, %eax, %eax
-	testl	%eax, %eax
-	jz	L(aligned_more)
-	tzcntl	%eax, %eax
 # ifdef USE_AS_WMEMCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	sall	$2, %eax
+	andl	$(CHAR_PER_VEC - 1), %eax
 # endif
+	/* Remove the leading bytes.  */
+	sarxl	%eax, %r8d, %eax
 # ifndef USE_AS_RAWMEMCHR
 	/* Check the end of data.  */
-	cmpq	%rax, %rdx
-	jbe	L(zero)
+	cmpq	%rsi, %rdx
+	jbe	L(first_vec_x0)
+# endif
+	testl	%eax, %eax
+	jz	L(cross_page_continue)
+	tzcntl	%eax, %eax
+# ifdef USE_AS_WMEMCHR
+	/* NB: Multiply wchar_t count by CHAR_SIZE to get the
+	   number of bytes.  */
+	leaq	(%RAW_PTR_REG, %rax, CHAR_SIZE), %rax
+# else
+	addq	%RAW_PTR_REG, %rax
 # endif
-	addq	%rdi, %rax
-	addq	%rcx, %rax
 	ret
 
 	.p2align 4
-L(aligned_more):
-# ifndef USE_AS_RAWMEMCHR
-        /* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)"
-	   instead of "(rdx + rcx) - VEC_SIZE" to void possible addition
-	   overflow.  */
-	negq	%rcx
-	addq	$VEC_SIZE, %rcx
+L(first_vec_x1):
+	tzcntl	%eax, %eax
+	leaq	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
+	ret
 
-	/* Check the end of data.  */
-	subq	%rcx, %rdx
-	jbe	L(zero)
-# endif
+	.p2align 4
+L(first_vec_x2):
+	tzcntl	%eax, %eax
+	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
 
-	addq	$VEC_SIZE, %rdi
+	.p2align 4
+L(first_vec_x3):
+	tzcntl	%eax, %eax
+	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
+
+	.p2align 4
+L(first_vec_x4):
+	tzcntl	%eax, %eax
+	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
+
+	.p2align 5
+L(aligned_more):
+	/* Check the first 4 * VEC_SIZE.  Only one
+	   VEC_SIZE at a time since data is only aligned to
+	   VEC_SIZE.  */
 
 # ifndef USE_AS_RAWMEMCHR
-	subq	$(VEC_SIZE * 4), %rdx
+	/* Align data to VEC_SIZE.  */
+L(cross_page_continue):
+	xorl	%ecx, %ecx
+	subl	%edi, %ecx
+	andq	$-VEC_SIZE, %rdi
+	/* esi is for adjusting length to see if near the
+	   end.  */
+	leal	(VEC_SIZE * 5)(%rdi, %rcx), %esi
+#  ifdef USE_AS_WMEMCHR
+	/* NB: Divide bytes by 4 to get the wchar_t
+	   count.  */
+	sarl	$2, %esi
+#  endif
+# else
+	andq	$-VEC_SIZE, %rdi
+L(cross_page_continue):
+# endif
+	/* Load first VEC regardless.  */
+	VPCMP	$0, (VEC_SIZE)(%rdi), %YMMMATCH, %k0
+	kmovd	%k0, %eax
+# ifndef USE_AS_RAWMEMCHR
+	/* Adjust length. If near end handle specially.
+	 */
+	subq	%rsi, %rdx
 	jbe	L(last_4x_vec_or_less)
 # endif
-
-L(more_4x_vec):
-	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
-	   since data is only aligned to VEC_SIZE.  */
-	VPCMP	$0, (%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x0)
-
-	VPCMP	$0, VEC_SIZE(%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x1)
 
-	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
+	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
+	kmovd	%k0, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x2)
 
-	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
+	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
+	kmovd	%k0, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x3)
 
-	addq	$(VEC_SIZE * 4), %rdi
+	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x4)
+
 
 # ifndef USE_AS_RAWMEMCHR
-	subq	$(VEC_SIZE * 4), %rdx
-	jbe	L(last_4x_vec_or_less)
-# endif
+	/* Check if at last CHAR_PER_VEC * 4 length.  */
+	subq	$(CHAR_PER_VEC * 4), %rdx
+	jbe	L(last_4x_vec_or_less_cmpeq)
+	addq	$VEC_SIZE, %rdi
 
-	/* Align data to 4 * VEC_SIZE.  */
-	movq	%rdi, %rcx
-	andl	$(4 * VEC_SIZE - 1), %ecx
+	/* Align data to VEC_SIZE * 4 for the loop and
+	   readjust length.  */
+#  ifdef USE_AS_WMEMCHR
+	movl	%edi, %ecx
 	andq	$-(4 * VEC_SIZE), %rdi
-
-# ifndef USE_AS_RAWMEMCHR
-	/* Adjust length.  */
+	andl	$(VEC_SIZE * 4 - 1), %ecx
+	/* NB: Divide bytes by 4 to get the wchar_t
+	   count.  */
+	sarl	$2, %ecx
 	addq	%rcx, %rdx
+#  else
+	addq	%rdi, %rdx
+	andq	$-(4 * VEC_SIZE), %rdi
+	subq	%rdi, %rdx
+#  endif
+# else
+	addq	$VEC_SIZE, %rdi
+	andq	$-(4 * VEC_SIZE), %rdi
 # endif
 
+	vpxorq	%XZERO, %XZERO, %XZERO
+
+	/* Compare 4 * VEC at a time forward.  */
 	.p2align 4
 L(loop_4x_vec):
-	/* Compare 4 * VEC at a time forward.  */
-	VPCMP	$0, (%rdi), %YMMMATCH, %k1
-	VPCMP	$0, VEC_SIZE(%rdi), %YMMMATCH, %k2
-	kord	%k1, %k2, %k5
-	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k3
-	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k4
-
-	kord	%k3, %k4, %k6
-	kortestd %k5, %k6
-	jnz	L(4x_vec_end)
-
-	addq	$(VEC_SIZE * 4), %rdi
-
+	/* It would be possible to save some instructions
+	   using 4x VPCMP but bottleneck on port 5 makes it not worth
+	   it.  */
+	VPCMP	$4, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k1
+	/* xor will set bytes matching esi to zero.  */
+	vpxorq	(VEC_SIZE * 5)(%rdi), %YMMMATCH, %YMM2
+	vpxorq	(VEC_SIZE * 6)(%rdi), %YMMMATCH, %YMM3
+	VPCMP	$0, (VEC_SIZE * 7)(%rdi), %YMMMATCH, %k3
+	/* Reduce VEC2 / VEC3 with min and VEC1 with zero
+	   mask.  */
+	VPMINU	%YMM2, %YMM3, %YMM3 {%k1} {z}
+	VPCMP	$0, %YMM3, %YZERO, %k2
 # ifdef USE_AS_RAWMEMCHR
-	jmp	L(loop_4x_vec)
+	subq	$-(VEC_SIZE * 4), %rdi
+	kortestd %k2, %k3
+	jz	L(loop_4x_vec)
 # else
-	subq	$(VEC_SIZE * 4), %rdx
-	ja	L(loop_4x_vec)
+	kortestd %k2, %k3
+	jnz	L(loop_4x_vec_end)
 
-L(last_4x_vec_or_less):
-	/* Less than 4 * VEC and aligned to VEC_SIZE.  */
-	addl	$(VEC_SIZE * 2), %edx
-	jle	L(last_2x_vec)
+	subq	$-(VEC_SIZE * 4), %rdi
 
-	VPCMP	$0, (%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x0)
+	subq	$(CHAR_PER_VEC * 4), %rdx
+	ja	L(loop_4x_vec)
 
-	VPCMP	$0, VEC_SIZE(%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
+	/* Fall through into the case of less than 4
+	   vectors of remaining length.  */
+	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
+	kmovd	%k0, %eax
+	addq	$(VEC_SIZE * 3), %rdi
+	.p2align 4
+L(last_4x_vec_or_less):
+	/* Check if first VEC contained match.  */
 	testl	%eax, %eax
-	jnz	L(first_vec_x1)
+	jnz	L(first_vec_x1_check)
 
-	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
-	testl	%eax, %eax
+	/* If remaining length > CHAR_PER_VEC * 2.  */
+	addl	$(CHAR_PER_VEC * 2), %edx
+	jg	L(last_4x_vec)
 
-	jnz	L(first_vec_x2_check)
-	subl	$VEC_SIZE, %edx
-	jle	L(zero)
+L(last_2x_vec):
+	/* If remaining length < CHAR_PER_VEC.  */
+	addl	$CHAR_PER_VEC, %edx
+	jle	L(zero_end)
+
+	/* Check VEC2 and compare any match with
+	   remaining length.  */
+	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
+	kmovd	%k0, %eax
+	tzcntl	%eax, %eax
+	cmpl	%eax, %edx
+	jbe	L(set_zero_end)
+	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+L(zero_end):
+	ret
 
-	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
-	testl	%eax, %eax
 
-	jnz	L(first_vec_x3_check)
+	.p2align 4
+L(first_vec_x1_check):
+	tzcntl	%eax, %eax
+	/* Adjust length.  */
+	subl	$-(CHAR_PER_VEC * 4), %edx
+	/* Check if match within remaining length.  */
+	cmpl	%eax, %edx
+	jbe	L(set_zero_end)
+	/* NB: Multiply CHAR index by CHAR_SIZE to get the
+	   byte offset.  */
+	leaq	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
+	ret
+L(set_zero_end):
 	xorl	%eax, %eax
 	ret
 
 	.p2align 4
-L(last_2x_vec):
-	addl	$(VEC_SIZE * 2), %edx
-	VPCMP	$0, (%rdi), %YMMMATCH, %k1
+L(loop_4x_vec_end):
+# endif
+	/* rawmemchr will fall through into this if match
+	   was found in loop.  */
+
+	/* k1 has the NOT of the matches with VEC1.  */
 	kmovd	%k1, %eax
-	testl	%eax, %eax
+# ifdef USE_AS_WMEMCHR
+	subl	$((1 << CHAR_PER_VEC) - 1), %eax
+# else
+	incl	%eax
+# endif
+	jnz	L(last_vec_x1_return)
 
-	jnz	L(first_vec_x0_check)
-	subl	$VEC_SIZE, %edx
-	jle	L(zero)
+	VPCMP	$0, %YMM2, %YZERO, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(last_vec_x2_return)
 
-	VPCMP	$0, VEC_SIZE(%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
+	kmovd	%k2, %eax
 	testl	%eax, %eax
-	jnz	L(first_vec_x1_check)
-	xorl	%eax, %eax
-	ret
+	jnz	L(last_vec_x3_return)
 
-	.p2align 4
-L(first_vec_x0_check):
+	kmovd	%k3, %eax
 	tzcntl	%eax, %eax
-# ifdef USE_AS_WMEMCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	sall	$2, %eax
+# ifdef USE_AS_RAWMEMCHR
+	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+# else
+	leaq	(VEC_SIZE * 7)(%rdi, %rax, CHAR_SIZE), %rax
 # endif
-	/* Check the end of data.  */
-	cmpq	%rax, %rdx
-	jbe	L(zero)
-	addq	%rdi, %rax
 	ret
 
 	.p2align 4
-L(first_vec_x1_check):
+L(last_vec_x1_return):
 	tzcntl	%eax, %eax
-# ifdef USE_AS_WMEMCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	sall	$2, %eax
-# endif
-	/* Check the end of data.  */
-	cmpq	%rax, %rdx
-	jbe	L(zero)
-	addq	$VEC_SIZE, %rax
+# ifdef USE_AS_RAWMEMCHR
+#  ifdef USE_AS_WMEMCHR
+	/* NB: Multiply CHAR index by CHAR_SIZE to get the
+	   byte offset.  */
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+#  else
 	addq	%rdi, %rax
-	ret
-
-	.p2align 4
-L(first_vec_x2_check):
-	tzcntl	%eax, %eax
-# ifdef USE_AS_WMEMCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	sall	$2, %eax
+#  endif
+# else
+	/* NB: Multiply CHAR index by CHAR_SIZE to get the
+	   byte offset.  */
+	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
 # endif
-	/* Check the end of data.  */
-	cmpq	%rax, %rdx
-	jbe	L(zero)
-	addq	$(VEC_SIZE * 2), %rax
-	addq	%rdi, %rax
 	ret
 
 	.p2align 4
-L(first_vec_x3_check):
+L(last_vec_x2_return):
 	tzcntl	%eax, %eax
-# ifdef USE_AS_WMEMCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	sall	$2, %eax
+# ifdef USE_AS_RAWMEMCHR
+	/* NB: Multiply CHAR index by CHAR_SIZE to get the
+	   byte offset.  */
+	leaq	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
+# else
+	/* NB: Multiply CHAR index by CHAR_SIZE to get the
+	   byte offset.  */
+	leaq	(VEC_SIZE * 5)(%rdi, %rax, CHAR_SIZE), %rax
 # endif
-	/* Check the end of data.  */
-	cmpq	%rax, %rdx
-	jbe	L(zero)
-	addq	$(VEC_SIZE * 3), %rax
-	addq	%rdi, %rax
 	ret
 
 	.p2align 4
-L(zero):
-	xorl	%eax, %eax
-	ret
-# endif
-
-	.p2align 4
-L(first_vec_x0):
+L(last_vec_x3_return):
 	tzcntl	%eax, %eax
-# ifdef USE_AS_WMEMCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	leaq	(%rdi, %rax, 4), %rax
+# ifdef USE_AS_RAWMEMCHR
+	/* NB: Multiply CHAR index by CHAR_SIZE to get the
+	   byte offset.  */
+	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
 # else
-	addq	%rdi, %rax
+	/* NB: Multiply CHAR index by CHAR_SIZE to get the
+	   byte offset.  */
+	leaq	(VEC_SIZE * 6)(%rdi, %rax, CHAR_SIZE), %rax
 # endif
 	ret
 
+
+# ifndef USE_AS_RAWMEMCHR
+L(last_4x_vec_or_less_cmpeq):
+	VPCMP	$0, (VEC_SIZE * 5)(%rdi), %YMMMATCH, %k0
+	kmovd	%k0, %eax
+	subq	$-(VEC_SIZE * 4), %rdi
+	/* Check first VEC regardless.  */
+	testl	%eax, %eax
+	jnz	L(first_vec_x1_check)
+
+	/* If remaining length <= CHAR_PER_VEC * 2.  */
+	addl	$(CHAR_PER_VEC * 2), %edx
+	jle	L(last_2x_vec)
+
 	.p2align 4
-L(first_vec_x1):
+L(last_4x_vec):
+	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(last_vec_x2)
+
+
+	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
+	kmovd	%k0, %eax
+	/* Create mask for possible matches within
+	   remaining length.  */
+#  ifdef USE_AS_WMEMCHR
+	movl	$((1 << (CHAR_PER_VEC * 2)) - 1), %ecx
+	bzhil	%edx, %ecx, %ecx
+#  else
+	movq	$-1, %rcx
+	bzhiq	%rdx, %rcx, %rcx
+#  endif
+	/* Test matches in data against length match.  */
+	andl	%ecx, %eax
+	jnz	L(last_vec_x3)
+
+	/* If remaining length <= CHAR_PER_VEC * 3 (Note
+	   this is after remaining length was found to be >
+	   CHAR_PER_VEC * 2).  */
+	subl	$CHAR_PER_VEC, %edx
+	jbe	L(zero_end2)
+
+
+	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
+	kmovd	%k0, %eax
+	/* Shift remaining length mask for last VEC.  */
+#  ifdef USE_AS_WMEMCHR
+	shrl	$CHAR_PER_VEC, %ecx
+#  else
+	shrq	$CHAR_PER_VEC, %rcx
+#  endif
+	andl	%ecx, %eax
+	jz	L(zero_end2)
 	tzcntl	%eax, %eax
-# ifdef USE_AS_WMEMCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	leaq	VEC_SIZE(%rdi, %rax, 4), %rax
-# else
-	addq	$VEC_SIZE, %rax
-	addq	%rdi, %rax
-# endif
+	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
+L(zero_end2):
 	ret
 
-	.p2align 4
-L(first_vec_x2):
+L(last_vec_x2):
 	tzcntl	%eax, %eax
-# ifdef USE_AS_WMEMCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	leaq	(VEC_SIZE * 2)(%rdi, %rax, 4), %rax
-# else
-	addq	$(VEC_SIZE * 2), %rax
-	addq	%rdi, %rax
-# endif
+	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
 	ret
 
 	.p2align 4
-L(4x_vec_end):
-	kmovd	%k1, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x0)
-	kmovd	%k2, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x1)
-	kmovd	%k3, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x2)
-	kmovd	%k4, %eax
-	testl	%eax, %eax
-L(first_vec_x3):
+L(last_vec_x3):
 	tzcntl	%eax, %eax
-# ifdef USE_AS_WMEMCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	leaq	(VEC_SIZE * 3)(%rdi, %rax, 4), %rax
-# else
-	addq	$(VEC_SIZE * 3), %rax
-	addq	%rdi, %rax
-# endif
+	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
 	ret
+# endif
 
-END (MEMCHR)
+END(MEMCHR)
 #endif