Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
S
Spack
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package registry
Container registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
eic_tools
Spack
Commits
0ed18de8
Commit
0ed18de8
authored
8 years ago
by
Denis Davydov
Committed by
Adam J. Stewart
8 years ago
Browse files
Options
Downloads
Patches
Plain Diff
opneblas: fix compilation with clang (#3862)
* opneblas: fix compilation with clang * indentation
parent
3331c042
Branches
Branches containing commit
Tags
Tags containing commit
No related merge requests found
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
var/spack/repos/builtin/packages/openblas/openblas0.2.19.diff
+847
-0
847 additions, 0 deletions
...spack/repos/builtin/packages/openblas/openblas0.2.19.diff
var/spack/repos/builtin/packages/openblas/package.py
+4
-6
4 additions, 6 deletions
var/spack/repos/builtin/packages/openblas/package.py
with
851 additions
and
6 deletions
var/spack/repos/builtin/packages/openblas/openblas0.2.19.diff
0 → 100644
+
847
−
0
View file @
0ed18de8
diff --git a/cpuid_x86.c b/cpuid_x86.c
index bbd377f..dff1507 100644
--- a/cpuid_x86.c
+++ b/cpuid_x86.c
@@ -1110,6 +1110,9 @@
int get_cpuname(void){
break;
case 3:
switch (model) {
+ case 7:
+ // Bay Trail
+ return CPUTYPE_ATOM;
case 10:
case 14:
// Ivy Bridge
diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c
index 18f85c3..a09660f 100644
--- a/driver/others/dynamic.c
+++ b/driver/others/dynamic.c
@@ -232,6 +232,7 @@
static gotoblas_t *get_coretype(void){
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
}
}
+ if (model == 7) return &gotoblas_ATOM; //Bay Trail
return NULL;
case 4:
//Intel Haswell
diff --git a/kernel/x86_64/dgemm_kernel_4x8_sandy.S b/kernel/x86_64/dgemm_kernel_4x8_sandy.S
index a52bb07..926395c 100644
--- a/kernel/x86_64/dgemm_kernel_4x8_sandy.S
+++ b/kernel/x86_64/dgemm_kernel_4x8_sandy.S
@@ -277,7 +277,7 @@
LEAQ (, %rax, SIZE), %rax;
LEAQ (ptrba, %rax, 8), ptrba;
LEAQ (ptrbb, %rax, 4), ptrbb;
#endif
-#### Initial Results Register ####
+//#### Initial Results Register ####
PREFETCH2 0*SIZE(prebb);
XOR_DY yvec15, yvec15, yvec15;
PREFETCH2 8*SIZE(prebb);
@@ -317,7 +317,7 @@
ALIGN_5;
.L2_bodyB:;
# Computing kernel
-#### Unroll times 1 ####
+//#### Unroll times 1 ####
LD_DY 4*SIZE(ptrba), yvec1;
MUL_DY yvec0, yvec2, yvec6;
SHUF_DY $0x03, yvec2, yvec2, yvec4;
@@ -345,7 +345,7 @@
MUL_DY yvec1, yvec5, yvec7;
ADD_DY yvec10, yvec6, yvec10;
ADD_DY yvec8, yvec7, yvec8;
-#### Unroll times 2 ####
+//#### Unroll times 2 ####
LD_DY 12*SIZE(ptrba), yvec1;
MUL_DY yvec0, yvec2, yvec6;
SHUF_DY $0x03, yvec2, yvec2, yvec4;
@@ -373,7 +373,7 @@
MUL_DY yvec1, yvec5, yvec7;
ADD_DY yvec10, yvec6, yvec10;
ADD_DY yvec8, yvec7, yvec8;
-#### Unroll times 3 ####
+//#### Unroll times 3 ####
LD_DY 20*SIZE(ptrba), yvec1;
MUL_DY yvec0, yvec2, yvec6;
SHUF_DY $0x03, yvec2, yvec2, yvec4;
@@ -402,7 +402,7 @@
MUL_DY yvec1, yvec5, yvec7;
ADD_DY yvec10, yvec6, yvec10;
ADD_DY yvec8, yvec7, yvec8;
-#### Unroll times 4 ####
+//#### Unroll times 4 ####
LD_DY 28*SIZE(ptrba), yvec1;
MUL_DY yvec0, yvec2, yvec6;
SHUF_DY $0x03, yvec2, yvec2, yvec4;
@@ -446,7 +446,7 @@
TEST $2, %rax;
JLE .L3_loopE;
ALIGN_5
.L3_bodyB:
-#### Unroll times 1 ####
+//#### Unroll times 1 ####
PREFETCH0 64*SIZE(ptrba)
LD_DY 4*SIZE(ptrba), yvec1;
MUL_DY yvec0, yvec2, yvec6;
@@ -475,7 +475,7 @@
MUL_DY yvec1, yvec5, yvec7;
ADD_DY yvec10, yvec6, yvec10;
ADD_DY yvec8, yvec7, yvec8;
-#### Unroll times 2 ####
+//#### Unroll times 2 ####
PREFETCH0 72*SIZE(ptrba)
LD_DY 12*SIZE(ptrba), yvec1;
MUL_DY yvec0, yvec2, yvec6;
@@ -516,7 +516,7 @@
TEST $1, %rax;
JLE .L4_loopE;
ALIGN_5
.L4_bodyB:;
-#### Unroll times 1 ####
+//#### Unroll times 1 ####
PREFETCH0 64*SIZE(ptrba)
LD_DY 4*SIZE(ptrba), yvec1;
MUL_DY yvec0, yvec2, yvec6;
@@ -544,9 +544,9 @@
ADD_DY yvec10, yvec6, yvec10;
ADD_DY yvec8, yvec7, yvec8;
.L4_loopE:;
-#### Load Alpha ####
+//#### Load Alpha ####
BROAD_DY MEMALPHA,yvec7;
-#### Multiply Alpha ####
+//#### Multiply Alpha ####
MUL_DY yvec7,yvec15,yvec15;
MUL_DY yvec7,yvec14,yvec14;
MUL_DY yvec7,yvec13,yvec13;
@@ -555,7 +555,7 @@
MUL_DY yvec7,yvec11,yvec11;
MUL_DY yvec7,yvec10,yvec10;
MUL_DY yvec7,yvec9,yvec9;
MUL_DY yvec7,yvec8,yvec8;
-#### Reverse the Results ####
+//#### Reverse the Results ####
MOV_DY yvec15,yvec7;
REVS_DY $0x0a,yvec13,yvec15,yvec15;
REVS_DY $0x0a,yvec7,yvec13,yvec13;
@@ -568,13 +568,13 @@
REVS_DY $0x0a,yvec7,yvec9,yvec9;
MOV_DY yvec10,yvec7;
REVS_DY $0x0a,yvec8,yvec10,yvec10;
REVS_DY $0x0a,yvec7,yvec8,yvec8;
-#### Testing alignment ####
+//#### Testing alignment ####
MOVQ C0, %rax;
OR ldc, %rax;
TEST $15, %rax;
JNE .L4_loopEx; # Unalign part write back
ALIGN_5
-#### Writing Back ####
+//#### Writing Back ####
EXTRA_DY $1,yvec15,xvec7;
EXTRA_DY $1,yvec14,xvec6;
EXTRA_DY $1,yvec13,xvec5;
@@ -776,7 +776,7 @@
LEAQ (, %rax, SIZE), %rax;
LEAQ (ptrba, %rax, 4), ptrba;
LEAQ (ptrbb, %rax, 4), ptrbb;
#endif
-#### Initial Results Register ####
+//#### Initial Results Register ####
XOR_DY yvec15, yvec15, yvec15;
XOR_DY yvec13, yvec13, yvec13;
LD_DY 0*SIZE(ptrbb), yvec2;
@@ -805,7 +805,7 @@
ALIGN_5;
.L6_bodyB:;
# Computing kernel
-#### Untoll time 1 ####
+//#### Untoll time 1 ####
LD_DY 4*SIZE(ptrba), yvec1;
MUL_DY yvec0, yvec2, yvec6;
ADD_DY yvec15, yvec6, yvec15;
@@ -821,7 +821,7 @@
VPERMILP_DY $0x05, yvec2, yvec3;
MUL_DY yvec0, yvec5, yvec7;
ADD_DY yvec9, yvec7, yvec9;
-#### Untoll time 2 ####
+//#### Untoll time 2 ####
LD_DY 8*SIZE(ptrba), yvec0;
MUL_DY yvec1, yvec2, yvec6;
ADD_DY yvec15, yvec6, yvec15;
@@ -837,7 +837,7 @@
VPERMILP_DY $0x05, yvec2, yvec3;
MUL_DY yvec1, yvec5, yvec7;
ADD_DY yvec9, yvec7, yvec9;
-#### Untoll time 3 ####
+//#### Untoll time 3 ####
LD_DY 12*SIZE(ptrba), yvec1;
MUL_DY yvec0, yvec2, yvec6;
ADD_DY yvec15, yvec6, yvec15;
@@ -855,7 +855,7 @@
ADDQ $16*SIZE, ptrbb;
MUL_DY yvec0, yvec5, yvec7;
ADD_DY yvec9, yvec7, yvec9;
-#### Untoll time 4 ####
+//#### Untoll time 4 ####
LD_DY 0*SIZE(ptrba), yvec0;
MUL_DY yvec1, yvec2, yvec6;
ADD_DY yvec15, yvec6, yvec15;
@@ -883,7 +883,7 @@
TEST $2, %rax;
JLE .L7_loopE;
ALIGN_5
.L7_bodyB:;
-#### Untoll time 1 ####
+//#### Untoll time 1 ####
LD_DY 4*SIZE(ptrba), yvec1;
MUL_DY yvec0, yvec2, yvec6;
ADD_DY yvec15, yvec6, yvec15;
@@ -901,7 +901,7 @@
ADDQ $8*SIZE, ptrbb;
MUL_DY yvec0, yvec5, yvec7;
ADD_DY yvec9, yvec7, yvec9;
-#### Untoll time 2 ####
+//#### Untoll time 2 ####
LD_DY 0*SIZE(ptrba), yvec0;
MUL_DY yvec1, yvec2, yvec6;
ADD_DY yvec15, yvec6, yvec15;
@@ -927,7 +927,7 @@
TEST $1, %rax;
JLE .L8_loopE;
ALIGN_5
.L8_bodyB:;
-#### Untoll time 1 ####
+//#### Untoll time 1 ####
MUL_DY yvec0, yvec2, yvec6;
ADD_DY yvec15, yvec6, yvec15;
SHUF_DY $0x03, yvec2, yvec2, yvec4;
@@ -943,27 +943,27 @@
MUL_DY yvec0, yvec5, yvec7;
ADD_DY yvec9, yvec7, yvec9;
.L8_loopE:;
-#### Load Alpha ####
+//#### Load Alpha ####
BROAD_DY MEMALPHA, yvec7;
-#### Multiply Alpha ####
+//#### Multiply Alpha ####
MUL_DY yvec7,yvec15,yvec15;
MUL_DY yvec7,yvec13,yvec13;
MUL_DY yvec7,yvec11,yvec11;
MUL_DY yvec7,yvec9,yvec9;
-#### Reverse the Results ####
+//#### Reverse the Results ####
MOV_DY yvec15, yvec7;
REVS_DY $0x0a,yvec13,yvec15,yvec15;
REVS_DY $0x0a,yvec7,yvec13,yvec13;
MOV_DY yvec11,yvec7;
REVS_DY $0x0a,yvec9,yvec11,yvec11;
REVS_DY $0x0a,yvec7,yvec9,yvec9;
-#### Testing alignment ####
+//#### Testing alignment ####
MOVQ C0, %rax;
OR ldc, %rax;
TEST $15, %rax;
JNE .L8_loopEx; # Unalign part write back
ALIGN_5
-#### Writing Back ####
+//#### Writing Back ####
EXTRA_DY $1,yvec15,xvec7;
EXTRA_DY $1,yvec13,xvec5;
EXTRA_DY $1,yvec11,xvec3;
@@ -1076,7 +1076,7 @@
LEAQ (, %rax, SIZE), %rax;
LEAQ (ptrba, %rax, 2), ptrba;
LEAQ (ptrbb, %rax, 4), ptrbb
#endif
-#### Initial Results Register ####
+//#### Initial Results Register ####
LD_DX 0*SIZE(ptrbb), xvec2;
XOR_DY yvec15, yvec15, yvec15;
LD_DX 2*SIZE(ptrbb), xvec3;
@@ -1106,7 +1106,7 @@
ALIGN_5;
.L10_bodyB:;
# Computing kernel
-##### Unroll time 1 ####
+//#### Unroll time 1 ####
LD_DX 4*SIZE(ptrbb), xvec6;
SHUF_DX $0x4e, xvec3, xvec5;
MUL_DX xvec0, xvec2, xvec2;
@@ -1123,7 +1123,7 @@
SHUF_DX $0x4e, xvec6, xvec4;
MUL_DX xvec0, xvec5, xvec5;
ADD_DX xvec5, xvec9, xvec9;
-#### Unroll time 2 ####
+//#### Unroll time 2 ####
LD_DX 8*SIZE(ptrbb), xvec2;
SHUF_DX $0x4e, xvec7, xvec5;
MUL_DX xvec1, xvec6, xvec6;
@@ -1140,7 +1140,7 @@
SHUF_DX $0x4e, xvec2, xvec4;
MUL_DX xvec1, xvec5, xvec5;
ADD_DX xvec5, xvec9, xvec9;
-##### Unroll time 3 ####
+//#### Unroll time 3 ####
LD_DX 12*SIZE(ptrbb), xvec6;
SHUF_DX $0x4e, xvec3, xvec5;
MUL_DX xvec0, xvec2, xvec2;
@@ -1159,7 +1159,7 @@
ADDQ $8*SIZE, ptrba;
MUL_DX xvec0, xvec5, xvec5;
ADD_DX xvec5, xvec9, xvec9;
-#### Unroll time 4 ####
+//#### Unroll time 4 ####
LD_DX 0*SIZE(ptrbb), xvec2;
SHUF_DX $0x4e, xvec7, xvec5;
MUL_DX xvec1, xvec6, xvec6;
@@ -1188,7 +1188,7 @@
TEST $2, %rax;
JLE .L11_loopE;
ALIGN_5
.L11_bodyB:;
-##### Unroll time 1 ####
+//#### Unroll time 1 ####
LD_DX 4*SIZE(ptrbb), xvec6;
SHUF_DX $0x4e, xvec3, xvec5;
MUL_DX xvec0, xvec2, xvec2;
@@ -1208,7 +1208,7 @@
ADDQ $4*SIZE, ptrba;
MUL_DX xvec0, xvec5, xvec5;
ADD_DX xvec5, xvec9, xvec9;
-#### Unroll time 2 ####
+//#### Unroll time 2 ####
LD_DX 0*SIZE(ptrbb), xvec2;
SHUF_DX $0x4e, xvec7, xvec5;
MUL_DX xvec1, xvec6, xvec6;
@@ -1251,27 +1251,27 @@
MUL_DX xvec0, xvec5, xvec5;
ADD_DX xvec5, xvec9, xvec9;
.L12_loopE:;
-#### Load Alpha ####
+//#### Load Alpha ####
BROAD_DX MEMALPHA, xvec7;
-#### Multiply Alpha ####
+//#### Multiply Alpha ####
MUL_DX xvec7, xvec15, xvec15;
MUL_DX xvec7, xvec13, xvec13;
MUL_DX xvec7, xvec11, xvec11;
MUL_DX xvec7, xvec9, xvec9;
-#### Reverse the Results ####
+//#### Reverse the Results ####
MOV_DX xvec15, xvec6;
REVS_DX xvec13, xvec15, xvec15;
REVS_DX xvec6, xvec13, xvec13;
MOV_DX xvec11, xvec6;
REVS_DX xvec9, xvec11, xvec11;
REVS_DX xvec6, xvec9, xvec9;
-#### Testing Alignment ####
+//#### Testing Alignment ####
MOVQ C0, %rax;
OR ldc, %rax;
TEST $15, %rax;
JNE .L12_loopEx;
ALIGN_5
-#### Writing Back ####
+//#### Writing Back ####
#ifndef TRMMKERNEL
ADD_DX 0*SIZE(C0), xvec13, xvec13;
ADD_DX 0*SIZE(C0, ldc, 1), xvec15, xvec15;
@@ -1345,7 +1345,7 @@
LEAQ (,%rax, SIZE), %rax;
ADDQ %rax, ptrba;
LEAQ (ptrbb, %rax, 4), ptrbb;
#endif
-#### Initial Results Register ####
+//#### Initial Results Register ####
XOR_DY yvec15, yvec15, yvec15;
#ifndef TRMMKERNEL
MOVQ bk, k;
@@ -1429,11 +1429,11 @@
ADDQ $1*SIZE, ptrba;
ADDQ $4*SIZE, ptrbb;
.L16_loopE:
-#### Load Alpha ####
+//#### Load Alpha ####
BROAD_DY MEMALPHA, yvec7;
-#### Multiply Alpha ####
+//#### Multiply Alpha ####
MUL_DY yvec15, yvec7, yvec15;
-#### Writing Back ####
+//#### Writing Back ####
EXTRA_DY $1, yvec15, xvec7;
#ifndef TRMMKERNEL
LDL_DX 0*SIZE(C0), xvec0, xvec0;
@@ -1497,7 +1497,7 @@
LEAQ (, %rax, SIZE), %rax;
LEAQ (ptrba, %rax, 8), ptrba;
LEAQ (ptrbb, %rax, 2), ptrbb;
#endif
-#### Initial Results Register ####
+//#### Initial Results Register ####
XOR_DY yvec15, yvec15, yvec15;
XOR_DY yvec14, yvec14, yvec14;
XOR_DY yvec13, yvec13, yvec13;
@@ -1526,7 +1526,7 @@
JLE .L211_loopE;
ALIGN_5;
.L211_bodyB:
# Computing kernel
-#### Unroll time 1 ####
+//#### Unroll time 1 ####
LD_DX 0*SIZE(ptrba), xvec0;
LD_DX 0*SIZE(ptrbb), xvec4;
MOV_DX xvec4, xvec5;
@@ -1563,7 +1563,7 @@
ADD_DX xvec6, xvec9, xvec9;
MUL_DX xvec3, xvec7, xvec7;
ADD_DX xvec7, xvec8, xvec8;
-#### Unroll time 2 ####
+//#### Unroll time 2 ####
LD_DX 8*SIZE(ptrba), xvec0;
LD_DX 2*SIZE(ptrbb), xvec4;
MOV_DX xvec4, xvec5;
@@ -1600,7 +1600,7 @@
ADD_DX xvec6, xvec9, xvec9;
MUL_DX xvec3, xvec7, xvec7;
ADD_DX xvec7, xvec8, xvec8;
-#### Unroll time 3 ####
+//#### Unroll time 3 ####
LD_DX 16*SIZE(ptrba), xvec0;
LD_DX 4*SIZE(ptrbb), xvec4;
MOV_DX xvec4, xvec5;
@@ -1637,7 +1637,7 @@
ADD_DX xvec6, xvec9, xvec9;
MUL_DX xvec3, xvec7, xvec7;
ADD_DX xvec7, xvec8, xvec8;
-#### Unroll time 4 ####
+//#### Unroll time 4 ####
LD_DX 24*SIZE(ptrba), xvec0;
LD_DX 6*SIZE(ptrbb), xvec4;
MOV_DX xvec4, xvec5;
@@ -1689,7 +1689,7 @@
JLE .L212_loopE;
ALIGN_5;
.L212_bodyB:
# Computing kernel
-#### Unroll time 1 ####
+//#### Unroll time 1 ####
LD_DX 0*SIZE(ptrba), xvec0;
LD_DX 0*SIZE(ptrbb), xvec4;
MOV_DX xvec4, xvec5;
@@ -1726,7 +1726,7 @@
ADD_DX xvec6, xvec9, xvec9;
MUL_DX xvec3, xvec7, xvec7;
ADD_DX xvec7, xvec8, xvec8;
-#### Unroll time 2 ####
+//#### Unroll time 2 ####
LD_DX 8*SIZE(ptrba), xvec0;
LD_DX 2*SIZE(ptrbb), xvec4;
MOV_DX xvec4, xvec5;
@@ -1775,7 +1775,7 @@
TEST $1, %rax;
JLE .L213_loopE;
ALIGN_5
.L213_bodyB:
-#### Unroll time 1 ####
+//#### Unroll time 1 ####
LD_DX 0*SIZE(ptrba), xvec0;
LD_DX 0*SIZE(ptrbb), xvec4;
MOV_DX xvec4, xvec5;
@@ -1815,7 +1815,7 @@
MUL_DX xvec3, xvec7, xvec7;
ADD_DX xvec7, xvec8, xvec8;
.L213_loopE:
-#### Multiply Alpha ####
+//#### Multiply Alpha ####
BROAD_DX MEMALPHA, xvec7;
MUL_DX xvec7, xvec15, xvec15;
MUL_DX xvec7, xvec14, xvec14;
@@ -1825,7 +1825,7 @@
MUL_DX xvec7, xvec11, xvec11;
MUL_DX xvec7, xvec10, xvec10;
MUL_DX xvec7, xvec9, xvec9;
MUL_DX xvec7, xvec8, xvec8;
-#### Reverse #####
+//#### Reverse ####
MOV_DX xvec15, xvec6;
REVS_DX xvec11, xvec15, xvec15;
REVS_DX xvec6, xvec11, xvec11;
@@ -1838,13 +1838,13 @@
REVS_DX xvec6, xvec9, xvec9;
MOV_DX xvec12, xvec6;
REVS_DX xvec8, xvec12, xvec12;
REVS_DX xvec6, xvec8, xvec8;
-#### Testing Alignment ####
+//#### Testing Alignment ####
MOVQ C0, %rax;
OR ldc, %rax;
TEST $15, %rax;
JNE .L213_loopEx;
ALIGN_5
-#### Writing Back ####
+//#### Writing Back ####
#ifndef TRMMKERNEL
ADD_DX 0*SIZE(C0), xvec11, xvec11;
ADD_DX 2*SIZE(C0), xvec10, xvec10;
@@ -1952,7 +1952,7 @@
LEAQ (,%rax, SIZE), %rax;
LEAQ (ptrba, %rax, 4), ptrba;
LEAQ (ptrbb, %rax, 2), ptrbb;
#endif
-#### Initial Results Register ####
+//#### Initial Results Register ####
XOR_DY yvec15, yvec15, yvec15;
XOR_DY yvec14, yvec14, yvec14;
XOR_DY yvec11, yvec11, yvec11;
@@ -1977,7 +1977,7 @@
JLE .L221_loopE;
ALIGN_5
.L221_bodyB:;
# Computing kernel
-#### Unroll time 1 ####
+//#### Unroll time 1 ####
LD_DX 0*SIZE(ptrba), xvec0;
LD_DX 0*SIZE(ptrbb), xvec4;
MOV_DX xvec4, xvec5;
@@ -1996,7 +1996,7 @@
ADD_DX xvec4, xvec11, xvec11;
MUL_DX xvec1, xvec5, xvec5;
ADD_DX xvec5, xvec10, xvec10;
-#### Unroll time 2 ####
+//#### Unroll time 2 ####
LD_DX 4*SIZE(ptrba), xvec0;
LD_DX 2*SIZE(ptrbb), xvec4;
MOV_DX xvec4, xvec5;
@@ -2015,7 +2015,7 @@
ADD_DX xvec4, xvec11, xvec11;
MUL_DX xvec1, xvec5, xvec5;
ADD_DX xvec5, xvec10, xvec10;
-#### Unroll time 3 ####
+//#### Unroll time 3 ####
LD_DX 8*SIZE(ptrba), xvec0;
LD_DX 4*SIZE(ptrbb), xvec4;
MOV_DX xvec4, xvec5;
@@ -2034,7 +2034,7 @@
ADD_DX xvec4, xvec11, xvec11;
MUL_DX xvec1, xvec5, xvec5;
ADD_DX xvec5, xvec10, xvec10;
-#### Unroll time 4 ####
+//#### Unroll time 4 ####
LD_DX 12*SIZE(ptrba), xvec0;
LD_DX 6*SIZE(ptrbb), xvec4;
MOV_DX xvec4, xvec5;
@@ -2067,7 +2067,7 @@
TEST $2, %rax;
JLE .L222_loopE;
ALIGN_5
.L222_bodyB:
-#### Unroll time 1 ####
+//#### Unroll time 1 ####
LD_DX 0*SIZE(ptrba), xvec0;
LD_DX 0*SIZE(ptrbb), xvec4;
MOV_DX xvec4, xvec5;
@@ -2086,7 +2086,7 @@
ADD_DX xvec4, xvec11, xvec11;
MUL_DX xvec1, xvec5, xvec5;
ADD_DX xvec5, xvec10, xvec10;
-#### Unroll time 2 ####
+//#### Unroll time 2 ####
LD_DX 4*SIZE(ptrba), xvec0;
LD_DX 2*SIZE(ptrbb), xvec4;
MOV_DX xvec4, xvec5;
@@ -2116,7 +2116,7 @@
TEST $1, %rax;
JLE .L223_loopE;
ALIGN_5
.L223_bodyB:
-#### Unroll time 1 ####
+//#### Unroll time 1 ####
LD_DX 0*SIZE(ptrba), xvec0;
LD_DX 0*SIZE(ptrbb), xvec4;
MOV_DX xvec4, xvec5;
@@ -2138,26 +2138,26 @@
MUL_DX xvec1, xvec5, xvec5;
ADD_DX xvec5, xvec10, xvec10;
.L223_loopE:
-#### Multiply Alpha ####
+//#### Multiply Alpha ####
BROAD_DX MEMALPHA, xvec7;
MUL_DX xvec7, xvec15, xvec15;
MUL_DX xvec7, xvec14, xvec14;
MUL_DX xvec7, xvec11, xvec11;
MUL_DX xvec7, xvec10, xvec10;
-#### Reverse #####
+//#### Reverse ####
MOV_DX xvec15, xvec6;
REVS_DX xvec11, xvec15, xvec15;
REVS_DX xvec6, xvec11, xvec11;
MOV_DX xvec14, xvec6;
REVS_DX xvec10, xvec14, xvec14;
REVS_DX xvec6, xvec10, xvec10;
-#### Testing Alignment ####
+//#### Testing Alignment ####
MOVQ C0, %rax;
OR ldc, %rax;
TEST $15, %rax;
JNE .L223_loopEx;
ALIGN_5
-#### Writing Back ####
+//#### Writing Back ####
#ifndef TRMMKERNEL
ADD_DX 0*SIZE(C0), xvec11, xvec11;
ADD_DX 2*SIZE(C0), xvec10, xvec10;
@@ -2220,7 +2220,7 @@
ADDQ $4, kk
ADDQ $4*SIZE, C0;
ADDQ $4*SIZE, C1;
.L22_loopE:;
-TEST $2, bm; # Rm = 2
+TEST $2, bm; // Rm = 2
JLE .L23_loopE;
ALIGN_5;
.L23_bodyB:
@@ -2255,7 +2255,7 @@
JLE .L231_loopE;
ALIGN_5
.L231_bodyB:
# Computing kernel
-#### Unroll time 1 ####
+//#### Unroll time 1 ####
LD_DX 0*SIZE(ptrba), xvec0;
LD_DX 0*SIZE(ptrbb), xvec4;
SHUF_DX $0x4e, xvec4, xvec5;
@@ -2264,7 +2264,7 @@
ADD_DX xvec4, xvec15, xvec15;
MUL_DX xvec0, xvec5, xvec5;
ADD_DX xvec5, xvec11, xvec11;
-#### Unroll time 2 ####
+//#### Unroll time 2 ####
LD_DX 2*SIZE(ptrba), xvec0;
LD_DX 2*SIZE(ptrbb), xvec4;
SHUF_DX $0x4e, xvec4, xvec5;
@@ -2273,7 +2273,7 @@
ADD_DX xvec4, xvec15, xvec15;
MUL_DX xvec0, xvec5, xvec5;
ADD_DX xvec5, xvec11, xvec11;
-#### Unroll time 3 ####
+//#### Unroll time 3 ####
LD_DX 4*SIZE(ptrba), xvec0;
LD_DX 4*SIZE(ptrbb), xvec4;
SHUF_DX $0x4e, xvec4, xvec5;
@@ -2282,7 +2282,7 @@
ADD_DX xvec4, xvec15, xvec15;
MUL_DX xvec0, xvec5, xvec5;
ADD_DX xvec5, xvec11, xvec11;
-#### Unroll time 4 ####
+//#### Unroll time 4 ####
LD_DX 6*SIZE(ptrba), xvec0;
LD_DX 6*SIZE(ptrbb), xvec4;
SHUF_DX $0x4e, xvec4, xvec5;
@@ -2305,7 +2305,7 @@
TEST $2, %rax;
JLE .L232_loopE;
ALIGN_5
.L232_bodyB:
-#### Unroll time 1 ####
+//#### Unroll time 1 ####
LD_DX 0*SIZE(ptrba), xvec0;
LD_DX 0*SIZE(ptrbb), xvec4;
SHUF_DX $0x4e, xvec4, xvec5;
@@ -2314,7 +2314,7 @@
ADD_DX xvec4, xvec15, xvec15;
MUL_DX xvec0, xvec5, xvec5;
ADD_DX xvec5, xvec11, xvec11;
-#### Unroll time 2 ####
+//#### Unroll time 2 ####
LD_DX 2*SIZE(ptrba), xvec0;
LD_DX 2*SIZE(ptrbb), xvec4;
SHUF_DX $0x4e, xvec4, xvec5;
@@ -2334,7 +2334,7 @@
TEST $1, %rax;
JLE .L233_loopE;
ALIGN_5
.L233_bodyB:
-#### Unroll time 1 ####
+//#### Unroll time 1 ####
LD_DX 0*SIZE(ptrba), xvec0;
LD_DX 0*SIZE(ptrbb), xvec4;
SHUF_DX $0x4e, xvec4, xvec5;
@@ -2345,21 +2345,21 @@
MUL_DX xvec0, xvec5, xvec5;
ADD_DX xvec5, xvec11, xvec11;
ADDQ $2*SIZE, ptrbb;
.L233_loopE:
-#### Multiply Alpha ####
+//#### Multiply Alpha ####
BROAD_DX MEMALPHA, xvec7;
MUL_DX xvec7, xvec15, xvec15;
MUL_DX xvec7, xvec11, xvec11;
-#### Reverse #####
+//#### Reverse ####
MOV_DX xvec15, xvec6;
REVS_DX xvec11, xvec15, xvec15;
REVS_DX xvec6, xvec11, xvec11;
-#### Testing Alignment ####
+//#### Testing Alignment ####
MOVQ C0, %rax;
OR ldc, %rax;
TEST $15, %rax;
JNE .L233_loopEx;
ALIGN_5
-#### Writing Back ####
+//#### Writing Back ####
#ifndef TRMMKERNEL
ADD_DX 0*SIZE(C0), xvec11, xvec11;
ADD_DX 0*SIZE(C1), xvec15, xvec15;
@@ -2408,7 +2408,7 @@
ADDQ $2, kk;
ADDQ $2*SIZE, C0;
ADDQ $2*SIZE, C1;
.L23_loopE:
-TEST $1, bm; # Rm = 1
+TEST $1, bm; // Rm = 1
JLE .L24_loopE;
ALIGN_5;
.L24_bodyB:
@@ -2534,7 +2534,7 @@
SALQ $4, k;
ADDQ k, bb;
LEAQ (C, ldc, 2), C;
.L20_loopE:;
-TEST $1, bn; # Rn = 1
+TEST $1, bn; // Rn = 1
JLE .L30_loopE;
ALIGN_5
.L30_bodyB:
@@ -2558,7 +2558,7 @@
LEAQ (, %rax, SIZE), %rax;
LEAQ (ptrba, %rax, 8), ptrba;
ADDQ %rax, ptrbb;
#endif
-#### Initial Results Register ####
+//#### Initial Results Register ####
XOR_DY yvec15, yvec15, yvec15;
XOR_DY yvec14, yvec14, yvec14;
#ifndef TRMMKERNEL
@@ -2580,7 +2580,7 @@
SARQ $2, k;
JLE .L311_loopE;
ALIGN_5
.L311_bodyB:
-#### Unroll time 1 ####
+//#### Unroll time 1 ####
LD_DY 0*SIZE(ptrba), yvec0;
LD_DY 4*SIZE(ptrba), yvec1;
BROAD_DY 0*SIZE(ptrbb), yvec2;
@@ -2589,7 +2589,7 @@
ADD_DY yvec0, yvec15, yvec15;
MUL_DY yvec2, yvec1, yvec1;
ADD_DY yvec1, yvec14, yvec14;
-#### Unroll time 2 ####
+//#### Unroll time 2 ####
LD_DY 8*SIZE(ptrba), yvec3;
LD_DY 12*SIZE(ptrba), yvec4;
BROAD_DY 1*SIZE(ptrbb), yvec5;
@@ -2598,7 +2598,7 @@
ADD_DY yvec3, yvec15, yvec15;
MUL_DY yvec5, yvec4, yvec4
ADD_DY yvec4, yvec14, yvec14;
-#### Unroll time 3 ####
+//#### Unroll time 3 ####
LD_DY 16*SIZE(ptrba), yvec0;
LD_DY 20*SIZE(ptrba), yvec1;
BROAD_DY 2*SIZE(ptrbb), yvec2;
@@ -2607,7 +2607,7 @@
ADD_DY yvec0, yvec15, yvec15;
MUL_DY yvec2, yvec1, yvec1;
ADD_DY yvec1, yvec14, yvec14;
-#### Unroll time 2 ####
+//#### Unroll time 2 ####
LD_DY 24*SIZE(ptrba), yvec3;
LD_DY 28*SIZE(ptrba), yvec4;
BROAD_DY 3*SIZE(ptrbb), yvec5;
@@ -2630,7 +2630,7 @@
TEST $2, %rax;
JLE .L312_loopE;
ALIGN_5
.L312_bodyB:
-#### Unroll time 1 ####
+//#### Unroll time 1 ####
LD_DY 0*SIZE(ptrba), yvec0;
LD_DY 4*SIZE(ptrba), yvec1;
BROAD_DY 0*SIZE(ptrbb), yvec2;
@@ -2639,7 +2639,7 @@
ADD_DY yvec0, yvec15, yvec15;
MUL_DY yvec2, yvec1, yvec1;
ADD_DY yvec1, yvec14, yvec14;
-#### Unroll time 2 ####
+//#### Unroll time 2 ####
LD_DY 8*SIZE(ptrba), yvec3;
LD_DY 12*SIZE(ptrba), yvec4;
BROAD_DY 1*SIZE(ptrbb), yvec5;
@@ -2660,7 +2660,7 @@
TEST $1, %rax;
JLE .L313_loopE;
ALIGN_5
.L313_bodyB:
-#### Unroll time 1 ####
+//#### Unroll time 1 ####
LD_DY 0*SIZE(ptrba), yvec0;
LD_DY 4*SIZE(ptrba), yvec1;
BROAD_DY 0*SIZE(ptrbb), yvec2;
@@ -2672,17 +2672,17 @@
ADD_DY yvec1, yvec14, yvec14;
ADDQ $1*SIZE, ptrbb;
.L313_loopE:
-#### Multiply Alpha ####
+//#### Multiply Alpha ####
BROAD_DY MEMALPHA, yvec7;
MUL_DY yvec7, yvec15, yvec15;
MUL_DY yvec7, yvec14, yvec14;
-#### Testing Alignment ####
+//#### Testing Alignment ####
MOVQ C0, %rax;
OR ldc, %rax;
TEST $15, %rax;
JNE .L313_loopEx;
ALIGN_5
-#### Writing Back ####
+//#### Writing Back ####
EXTRA_DY $1, yvec15, xvec13;
EXTRA_DY $1, yvec14, xvec12;
#ifndef TRMMKERNEL
@@ -2762,7 +2762,7 @@
LEAQ (,%rax, SIZE), %rax;
LEAQ (ptrba, %rax, 4), ptrba;
ADDQ %rax, ptrbb;
#endif
-#### Initial Results Register ####
+//#### Initial Results Register ####
XOR_DY yvec15, yvec15, yvec15;
#ifndef TRMMKERNEL
MOVQ bk, k;
@@ -2847,16 +2847,16 @@
ADDQ $4*SIZE, ptrba;
ADDQ $1*SIZE, ptrbb;
.L323_loopE:
-#### Multiply Alpha ####
+//#### Multiply Alpha ####
BROAD_DY MEMALPHA, yvec7;
MUL_DY yvec7, yvec15, yvec15;
-#### Testing Alignment ####
+//#### Testing Alignment ####
MOVQ C0, %rax;
OR ldc, %rax;
TEST $15, %rax;
JNE .L323_loopEx;
ALIGN_5
-#### Writing Back ####
+//#### Writing Back ####
EXTRA_DY $1, yvec15, xvec14;
#ifndef TRMMKERNEL
ADD_DX 0*SIZE(C0), xvec15, xvec15;
@@ -2878,7 +2878,7 @@
ADDQ $4*SIZE, C0;
JMP .L32_loopE;
ALIGN_5
.L323_loopEx:
-#### Writing Back ####
+//#### Writing Back ####
EXTRA_DY $1, yvec15, xvec14;
#ifndef TRMMKERNEL
LDL_DX 0*SIZE(C0), xvec13, xvec13;
@@ -2917,7 +2917,7 @@
LEAQ (, %rax, SIZE), %rax
LEAQ (ptrba, %rax, 2), ptrba
ADDQ %rax, ptrbb;
#endif
-#### Initial Result ####
+//#### Initial Result ####
XOR_DY yvec15, yvec15, yvec15;
#ifndef TRMMKERNEL
MOVQ bk, k;
@@ -3000,7 +3000,7 @@
ADD_DX xvec2, xvec15, xvec15;
ADDQ $2*SIZE, ptrba;
ADDQ $1*SIZE, ptrbb;
.L333_loopE:
-#### Multiply Alpha ####
+//#### Multiply Alpha ####
BROAD_DX MEMALPHA, xvec7;
MUL_DX xvec7, xvec15, xvec15;
#ifndef TRMMKERNEL
@@ -3119,7 +3119,7 @@
addq $1*SIZE, ptrba;
addq $1*SIZE, ptrbb;
.L343_loopE:
-#### Writing Back ####
+//#### Writing Back ####
vmovsd MEMALPHA, xvec7;
vmulsd xvec7, xvec15, xvec15;
#ifndef TRMMKERNEL
This diff is collapsed.
Click to expand it.
var/spack/repos/builtin/packages/openblas/package.py
+
4
−
6
View file @
0ed18de8
...
...
@@ -57,6 +57,10 @@ class Openblas(MakefilePackage):
# https://github.com/xianyi/OpenBLAS/pull/915
patch
(
'
openblas_icc.patch
'
,
when
=
'
%intel
'
)
# Change file comments to work around clang 3.9 assembler bug
# https://github.com/xianyi/OpenBLAS/pull/982
patch
(
'
openblas0.2.19.diff
'
,
when
=
'
@0.2.19
'
)
parallel
=
False
conflicts
(
'
%intel@16
'
,
when
=
'
@0.2.15:0.2.19
'
)
...
...
@@ -78,12 +82,6 @@ def check_compilers(self):
'
OpenBLAS does not support OpenMP with clang!
'
)
spec
=
self
.
spec
if
spec
.
satisfies
(
'
%clang@8.1.0:
'
)
and
spec
.
satisfies
(
'
@:0.2.19
'
):
raise
InstallError
(
'
OpenBLAS @:0.2.19 does not build with Apple clang@8.1.0:
'
)
@property
def
make_defs
(
self
):
# Configure fails to pick up fortran from FC=/abs/path/to/f77, but
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment