Commit 10cdc3fc authored by Santiago Marco-Sola's avatar Santiago Marco-Sola
Browse files

Removed unused AVX code

parent b34bb43b
......@@ -37,7 +37,7 @@ BWAMEM2_PATH=./bwa-mem2
CXXFLAGS=-std=c++11 -fopenmp
CPPFLAGS=-DENABLE_PREFETCH -DBWA_OTHER_ELE=0
INCLUDES=-I$(BWAMEM2_PATH)/src -I$(BWAMEM2_PATH)/ext/safestringlib/include
INCLUDES=-I$(BWAMEM2_PATH)/src -I$(BWAMEM2_PATH)/ext/safestringlib/include -I/usr/include/
LIBS=-L$(BWAMEM2_PATH) -L$(BWAMEM2_PATH)/ext/safestringlib -lsafestring -fopenmp -lz -lbwa -ldl
.PHONY:all clean depend
......
......@@ -35,7 +35,7 @@ EXE=bwa-mem2
MEM_FLAGS=-DSAIS=1
CPPFLAGS+=-DENABLE_PREFETCH -DV17=1 $(MEM_FLAGS)
INCLUDES=-Isrc -I./ext/safestringlib/include
INCLUDES=-Isrc -I./ext/safestringlib/include -I/usr/include/
LIBS=-lpthread -lm -lz -L. -lbwa -L./ext/safestringlib -lsafestring
OBJS=src/fastmap.o src/bwtindex.o src/utils.o src/memcpy_bwamem.o src/kthread.o \
src/kstring.o src/ksw.o src/bntseq.o src/bwamem.o src/profiling.o \
......
......@@ -5,7 +5,7 @@ CXX=$(EPI_TOOLCHAIN)/bin/clang++
IDIR = include
MKDIR_P = mkdir -p
CFLAGS=-I$(IDIR) -fPIE -fPIC -O2 -D_FORTIFY_SOURCE=2 -Wformat -Wformat-security
LDFLAGS=-z noexecstack -z relo -z now
#LDFLAGS=-z noexecstack -z relo -z now
ODIR=obj
OTDIR=objtest
......
......@@ -41,6 +41,14 @@ extern "C" {
}
#endif
/*
* Compatibility MACROS
*/
#define _mm_malloc(size,align) malloc(size)
#define _mm_free(addr) free(addr)
#define __rdtsc() clock()
#define _mm_prefetch(addr,mode) __builtin_prefetch(addr)
FMI_search::FMI_search(const char *fname)
{
fprintf(stderr, "* Entering FMI_search\n");
......
......@@ -29,9 +29,13 @@ Authors: Vasimuddin Md <vasimuddin.md@intel.com>; Sanchit Misra <sanchit.misra@i
#include "omp.h"
#include "bandedSWA.h"
#ifdef VTUNE_ANALYSIS
#include <ittnotify.h>
#endif
/*
* Compatibility MACROS
*/
#define _mm_malloc(size,align) malloc(size)
#define _mm_free(addr) free(addr)
#define __rdtsc() clock()
#if defined(__clang__) || defined(__GNUC__)
#define __mmask8 uint8_t
......@@ -39,6 +43,7 @@ Authors: Vasimuddin Md <vasimuddin.md@intel.com>; Sanchit Misra <sanchit.misra@i
#define __mmask32 uint32_t
#endif
// ------------------------------------------------------------------------------------
// MACROs for vector code
extern uint64_t prof[10][112];
......@@ -271,4596 +276,3 @@ void BandedPairWiseSW::scalarBandedSWAWrapper(SeqPair *seqPairArray,
}
#if ((!__AVX512BW__) & (__AVX2__))
//------------------------------------------------------------------------------
// MACROs
// ------------------------ vec-8 ---------------------------------------------
#define ZSCORE8(i4_256, y4_256) \
{ \
__m256i tmpi = _mm256_sub_epi8(i4_256, x256); \
__m256i tmpj = _mm256_sub_epi8(y4_256, y256); \
cmp = _mm256_cmpgt_epi8(tmpi, tmpj); \
score256 = _mm256_sub_epi8(maxScore256, maxRS1); \
__m256i insdel = _mm256_blendv_epi8(e_ins256, e_del256, cmp); \
__m256i sub_a256 = _mm256_sub_epi8(tmpi, tmpj); \
__m256i sub_b256 = _mm256_sub_epi8(tmpj, tmpi); \
tmp = _mm256_blendv_epi8(sub_b256, sub_a256, cmp); \
tmp = _mm256_sub_epi8(score256, tmp); \
cmp = _mm256_cmpgt_epi8(tmp, zdrop256); \
exit0 = _mm256_blendv_epi8(exit0, zero256, cmp); \
}
#define MAIN_CODE8(s1, s2, h00, h11, e11, f11, f21, zero256, maxScore256, e_ins256, oe_ins256, e_del256, oe_del256, y256, maxRS) \
{ \
__m256i cmp11 = _mm256_cmpeq_epi8(s1, s2); \
__m256i sbt11 = _mm256_blendv_epi8(mismatch256, match256, cmp11); \
__m256i tmp256 = _mm256_max_epu8(s1, s2); \
/*tmp256 = _mm256_cmpeq_epi8(tmp256, val102);*/ \
sbt11 = _mm256_blendv_epi8(sbt11, w_ambig_256, tmp256); \
__m256i m11 = _mm256_add_epi8(h00, sbt11); \
cmp11 = _mm256_cmpeq_epi8(h00, zero256); \
m11 = _mm256_blendv_epi8(m11, zero256, cmp11); \
h11 = _mm256_max_epi8(m11, e11); \
h11 = _mm256_max_epi8(h11, f11); \
__m256i temp256 = _mm256_sub_epi8(m11, oe_ins256); \
__m256i val256 = _mm256_max_epi8(temp256, zero256); \
e11 = _mm256_sub_epi8(e11, e_ins256); \
e11 = _mm256_max_epi8(val256, e11); \
temp256 = _mm256_sub_epi8(m11, oe_del256); \
val256 = _mm256_max_epi8(temp256, zero256); \
f21 = _mm256_sub_epi8(f11, e_del256); \
f21 = _mm256_max_epi8(val256, f21); \
}
// ------------------------ vec 16 --------------------------------------------------
#define _mm256_blendv_epi16(a,b,c) \
_mm256_blendv_epi8(a, b, c);
#define ZSCORE16(i4_256, y4_256) \
{ \
__m256i tmpi = _mm256_sub_epi16(i4_256, x256); \
__m256i tmpj = _mm256_sub_epi16(y4_256, y256); \
cmp = _mm256_cmpgt_epi16(tmpi, tmpj); \
score256 = _mm256_sub_epi16(maxScore256, maxRS1); \
__m256i insdel = _mm256_blendv_epi16(e_ins256, e_del256, cmp); \
__m256i sub_a256 = _mm256_sub_epi16(tmpi, tmpj); \
__m256i sub_b256 = _mm256_sub_epi16(tmpj, tmpi); \
tmp = _mm256_blendv_epi16(sub_b256, sub_a256, cmp); \
tmp = _mm256_sub_epi16(score256, tmp); \
cmp = _mm256_cmpgt_epi16(tmp, zdrop256); \
exit0 = _mm256_blendv_epi16(exit0, zero256, cmp); \
}
#define MAIN_CODE16(s1, s2, h00, h11, e11, f11, f21, zero256, maxScore256, e_ins256, oe_ins256, e_del256, oe_del256, y256, maxRS) \
{ \
__m256i cmp11 = _mm256_cmpeq_epi16(s1, s2); \
__m256i sbt11 = _mm256_blendv_epi16(mismatch256, match256, cmp11); \
__m256i tmp256 = _mm256_max_epu16(s1, s2); \
sbt11 = _mm256_blendv_epi16(sbt11, w_ambig_256, tmp256); \
__m256i m11 = _mm256_add_epi16(h00, sbt11); \
cmp11 = _mm256_cmpeq_epi16(h00, zero256); \
m11 = _mm256_blendv_epi16(m11, zero256, cmp11); \
h11 = _mm256_max_epi16(m11, e11); \
h11 = _mm256_max_epi16(h11, f11); \
__m256i temp256 = _mm256_sub_epi16(m11, oe_ins256); \
__m256i val256 = _mm256_max_epi16(temp256, zero256); \
e11 = _mm256_sub_epi16(e11, e_ins256); \
e11 = _mm256_max_epi16(val256, e11); \
temp256 = _mm256_sub_epi16(m11, oe_del256); \
val256 = _mm256_max_epi16(temp256, zero256); \
f21 = _mm256_sub_epi16(f11, e_del256); \
f21 = _mm256_max_epi16(val256, f21); \
}
// MACROs section ends
// ------------------------------------------------------------------------------------
//----------------------------------------------------------------------------------
// B-SWA - Vector code
// ------------------------- AVX2 - 8 bit SIMD_LANES ---------------------------
inline void sortPairsLen(SeqPair *pairArray, int32_t count, SeqPair *tempArray,
int16_t *hist)
{
int32_t i;
__m256i zero256 = _mm256_setzero_si256();
for(i = 0; i <= MAX_SEQ_LEN16; i+=16)
_mm256_store_si256((__m256i *)(hist + i), zero256);
for(i = 0; i < count; i++)
{
SeqPair sp = pairArray[i];
hist[sp.len1]++;
}
int32_t cumulSum = 0;
for(i = 0; i <= MAX_SEQ_LEN16; i++)
{
int32_t cur = hist[i];
hist[i] = cumulSum;
// histb[i] = cumulSum;
cumulSum += cur;
}
for(i = 0; i < count; i++)
{
SeqPair sp = pairArray[i];
int32_t pos = hist[sp.len1];
tempArray[pos] = sp;
hist[sp.len1]++;
}
for(i = 0; i < count; i++) {
pairArray[i] = tempArray[i];
}
}
inline void sortPairsId(SeqPair *pairArray, int32_t first, int32_t count,
SeqPair *tempArray)
{
int32_t i;
for(i = 0; i < count; i++)
{
SeqPair sp = pairArray[i];
int32_t pos = sp.id - first;
tempArray[pos] = sp;
}
for(i = 0; i < count; i++)
pairArray[i] = tempArray[i];
}
/******************* Vector code, version 2.0 *************************/
#define PFD 2
void BandedPairWiseSW::getScores8(SeqPair *pairArray,
uint8_t *seqBufRef,
uint8_t *seqBufQer,
int32_t numPairs,
uint16_t numThreads,
int32_t w)
{
int64_t startTick, endTick;
smithWatermanBatchWrapper8(pairArray, seqBufRef, seqBufQer, numPairs, numThreads, w);
#if MAXI
printf("AVX2 Vecor code: Writing output..\n");
for (int l=0; l<numPairs; l++)
{
fprintf(stderr, "%d (%d %d) %d %d %d\n",
pairArray[l].score, pairArray[l].tle, pairArray[l].qle,
pairArray[l].gscore, pairArray[l].max_off, pairArray[l].gtle);
}
printf("Vector code: Writing output completed!!!\n\n");
#endif
}
void BandedPairWiseSW::smithWatermanBatchWrapper8(SeqPair *pairArray,
uint8_t *seqBufRef,
uint8_t *seqBufQer,
int32_t numPairs,
uint16_t numThreads,
int32_t w)
{
int64_t st1, st2, st3, st4, st5;
#if RDT
st1 = __rdtsc();
#endif
uint8_t *seq1SoA = NULL;
seq1SoA = (uint8_t *)_mm_malloc(MAX_SEQ_LEN8 * SIMD_WIDTH8 * numThreads * sizeof(uint8_t), 64);
uint8_t *seq2SoA = NULL;
seq2SoA = (uint8_t *)_mm_malloc(MAX_SEQ_LEN8 * SIMD_WIDTH8 * numThreads * sizeof(uint8_t), 64);
if (seq1SoA == NULL || seq2SoA == NULL) {
fprintf(stderr, "Error! Mem not allocated!!!\n");
exit(EXIT_FAILURE);
}
int32_t ii;
int32_t roundNumPairs = ((numPairs + SIMD_WIDTH8 - 1)/SIMD_WIDTH8 ) * SIMD_WIDTH8;
// assert(roundNumPairs < BATCH_SIZE * SEEDS_PER_READ);
for(ii = numPairs; ii < roundNumPairs; ii++)
{
pairArray[ii].id = ii;
pairArray[ii].len1 = 0;
pairArray[ii].len2 = 0;
}
#if RDT
st2 = __rdtsc();
#endif
#if SORT_PAIRS // disbaled in bwa-mem2 (only used in separate benchmark bsw code)
// Sort the sequences according to decreasing order of lengths
SeqPair *tempArray = (SeqPair *)_mm_malloc(SORT_BLOCK_SIZE * numThreads *
sizeof(SeqPair), 64);
int16_t *hist = (int16_t *)_mm_malloc((MAX_SEQ_LEN8 + 32) * numThreads *
sizeof(int16_t), 64);
#pragma omp parallel num_threads(numThreads)
{
int32_t tid = omp_get_thread_num();
SeqPair *myTempArray = tempArray + tid * SORT_BLOCK_SIZE;
int16_t *myHist = hist + tid * (MAX_SEQ_LEN8 + 32);
#pragma omp for
for(ii = 0; ii < roundNumPairs; ii+=SORT_BLOCK_SIZE)
{
int32_t first, last;
first = ii;
last = ii + SORT_BLOCK_SIZE;
if(last > roundNumPairs) last = roundNumPairs;
sortPairsLen(pairArray + first, last - first, myTempArray, myHist);
}
}
_mm_free(hist);
#endif
#if RDT
st3 = __rdtsc();
#endif
int eb = end_bonus;
// #pragma omp parallel num_threads(numThreads)
{
int32_t i;
uint16_t tid = omp_get_thread_num();
// uint16_t tid = 0;
uint8_t *mySeq1SoA = NULL;
mySeq1SoA = seq1SoA + tid * MAX_SEQ_LEN8 * SIMD_WIDTH8;
uint8_t *mySeq2SoA = NULL;
mySeq2SoA = seq2SoA + tid * MAX_SEQ_LEN8 * SIMD_WIDTH8;
assert(mySeq1SoA != NULL && mySeq2SoA != NULL);
uint8_t *seq1;
uint8_t *seq2;
uint8_t h0[SIMD_WIDTH8] __attribute__((aligned(64)));
uint8_t band[SIMD_WIDTH8];
uint8_t qlen[SIMD_WIDTH8] __attribute__((aligned(64)));
int32_t bsize = 0;
int8_t *H1 = H8_ + tid * SIMD_WIDTH8 * MAX_SEQ_LEN8;
int8_t *H2 = H8__ + tid * SIMD_WIDTH8 * MAX_SEQ_LEN8;
__m256i zero256 = _mm256_setzero_si256();
__m256i e_ins256 = _mm256_set1_epi8(e_ins);
__m256i oe_ins256 = _mm256_set1_epi8(o_ins + e_ins);
__m256i o_del256 = _mm256_set1_epi8(o_del);
__m256i e_del256 = _mm256_set1_epi8(e_del);
__m256i eb_ins256 = _mm256_set1_epi8(eb - o_ins);
__m256i eb_del256 = _mm256_set1_epi8(eb - o_del);
int8_t max = 0;
if (max < w_match) max = w_match;
if (max < w_mismatch) max = w_mismatch;
if (max < w_ambig) max = w_ambig;
int nstart = 0, nend = numPairs;
// #pragma omp for schedule(dynamic, 128)
for(i = nstart; i < nend; i+=SIMD_WIDTH8)
{
int32_t j, k;
uint8_t maxLen1 = 0;
uint8_t maxLen2 = 0;
bsize = w;
uint64_t tim;
for(j = 0; j < SIMD_WIDTH8; j++)
{
SeqPair sp = pairArray[i + j];
h0[j] = sp.h0;
seq1 = seqBufRef + (int64_t)sp.idr;
for(k = 0; k < sp.len1; k++)
{
mySeq1SoA[k * SIMD_WIDTH8 + j] = (seq1[k] == AMBIG?0xFF:seq1[k]);
H2[k * SIMD_WIDTH8 + j] = 0;
}
qlen[j] = sp.len2 * max;
if(maxLen1 < sp.len1) maxLen1 = sp.len1;
}
for(j = 0; j < SIMD_WIDTH8; j++)
{
SeqPair sp = pairArray[i + j];
for(k = sp.len1; k <= maxLen1; k++) //removed "="
{
mySeq1SoA[k * SIMD_WIDTH8 + j] = DUMMY1;
H2[k * SIMD_WIDTH8 + j] = DUMMY1;
}
}
//--------------------
__m256i h0_256 = _mm256_load_si256((__m256i*) h0);
_mm256_store_si256((__m256i *) H2, h0_256);
__m256i tmp256 = _mm256_sub_epi8(h0_256, o_del256);
for(k = 1; k < maxLen1; k++) {
tmp256 = _mm256_sub_epi8(tmp256, e_del256);
__m256i tmp256_ = _mm256_max_epi8(tmp256, zero256);
_mm256_store_si256((__m256i *)(H2 + k* SIMD_WIDTH8), tmp256_);
}
//-------------------
for(j = 0; j < SIMD_WIDTH8; j++)
{
SeqPair sp = pairArray[i + j];
seq2 = seqBufQer + (int64_t)sp.idq;
if (sp.len2 > MAX_SEQ_LEN8) fprintf(stderr, "Error !! : %d %d\n", sp.id, sp.len2);
assert(sp.len2 < MAX_SEQ_LEN8);
for(k = 0; k < sp.len2; k++)
{
mySeq2SoA[k * SIMD_WIDTH8 + j] = (seq2[k]==AMBIG?0xFF:seq2[k]);
H1[k * SIMD_WIDTH8 + j] = 0;
}
if(maxLen2 < sp.len2) maxLen2 = sp.len2;
}
for(j = 0; j < SIMD_WIDTH8; j++)
{
SeqPair sp = pairArray[i + j];
for(k = sp.len2; k <= maxLen2; k++)
{
mySeq2SoA[k * SIMD_WIDTH8 + j] = DUMMY2;
H1[k * SIMD_WIDTH8 + j] = 0;
}
}
//------------------------
_mm256_store_si256((__m256i *) H1, h0_256);
__m256i cmp256 = _mm256_cmpgt_epi8(h0_256, oe_ins256);
tmp256 = _mm256_sub_epi8(h0_256, oe_ins256);
// _mm256_store_si256((__m256i *) (H1 + SIMD_WIDTH8), tmp256);
tmp256 = _mm256_blendv_epi8(zero256, tmp256, cmp256);
_mm256_store_si256((__m256i *) (H1 + SIMD_WIDTH8), tmp256);
for(k = 2; k < maxLen2; k++)
{
// __m256i h1_256 = _mm256_load_si256((__m256i *) (H1 + (k-1) * SIMD_WIDTH8));
__m256i h1_256 = tmp256;
tmp256 = _mm256_sub_epi8(h1_256, e_ins256);
tmp256 = _mm256_max_epi8(tmp256, zero256);
_mm256_store_si256((__m256i *)(H1 + k*SIMD_WIDTH8), tmp256);
}
//------------------------
/* Banding calculation in pre-processing */
uint8_t myband[SIMD_WIDTH8] __attribute__((aligned(64)));
uint8_t temp[SIMD_WIDTH8] __attribute__((aligned(64)));
{
__m256i qlen256 = _mm256_load_si256((__m256i *) qlen);
__m256i sum256 = _mm256_add_epi8(qlen256, eb_ins256);
_mm256_store_si256((__m256i *) temp, sum256);
for (int l=0; l<SIMD_WIDTH8; l++)
{
double val = temp[l]/e_ins + 1.0;
int max_ins = (int) val;
max_ins = max_ins > 1? max_ins : 1;
myband[l] = min_(bsize, max_ins);
}
sum256 = _mm256_add_epi8(qlen256, eb_del256);
_mm256_store_si256((__m256i *) temp, sum256);
for (int l=0; l<SIMD_WIDTH8; l++) {
double val = temp[l]/e_del + 1.0;
int max_ins = (int) val;
max_ins = max_ins > 1? max_ins : 1;
myband[l] = min_(myband[l], max_ins);
bsize = bsize < myband[l] ? myband[l] : bsize;
}
}
smithWaterman256_8(mySeq1SoA,
mySeq2SoA,
maxLen1,
maxLen2,
pairArray + i,
h0,
tid,
numPairs,
zdrop,
bsize,
qlen,
myband);
}
}
#if RDT
st4 = __rdtsc();
#endif
#if SORT_PAIRS // disbaled in bwa-mem2 (only used in separate benchmark bsw code)
{
// Sort the sequences according to increasing order of id
#pragma omp parallel num_threads(numThreads)
{
int32_t tid = omp_get_thread_num();
SeqPair *myTempArray = tempArray + tid * SORT_BLOCK_SIZE;
#pragma omp for
for(ii = 0; ii < roundNumPairs; ii+=SORT_BLOCK_SIZE)
{
int32_t first, last;
first = ii;
last = ii + SORT_BLOCK_SIZE;
if(last > roundNumPairs) last = roundNumPairs;
sortPairsId(pairArray + first, first, last - first, myTempArray);
}
}
_mm_free(tempArray);
}
#endif
#if RDT
st5 = __rdtsc();
setupTicks = st2 - st1;
sort1Ticks = st3 - st2;
swTicks = st4 - st3;
sort2Ticks = st5 - st4;
#endif
// free mem
_mm_free(seq1SoA);
_mm_free(seq2SoA);
return;
}
void BandedPairWiseSW::smithWaterman256_8(uint8_t seq1SoA[],
uint8_t seq2SoA[],
uint8_t nrow,
uint8_t ncol,
SeqPair *p,
uint8_t h0[],
uint16_t tid,
int32_t numPairs,
int zdrop,
int32_t w,
uint8_t qlen[],
uint8_t myband[])
{
__m256i match256 = _mm256_set1_epi8(this->w_match);
__m256i mismatch256 = _mm256_set1_epi8(this->w_mismatch);
__m256i gapOpen256 = _mm256_set1_epi8(this->w_open);
__m256i gapExtend256 = _mm256_set1_epi8(this->w_extend);
__m256i gapOE256 = _mm256_set1_epi8(this->w_open + this->w_extend);
__m256i w_ambig_256 = _mm256_set1_epi8(this->w_ambig); // ambig penalty
__m256i five256 = _mm256_set1_epi8(5);
__m256i e_del256 = _mm256_set1_epi8(this->e_del);
__m256i oe_del256 = _mm256_set1_epi8(this->o_del + this->e_del);
__m256i e_ins256 = _mm256_set1_epi8(this->e_ins);
__m256i oe_ins256 = _mm256_set1_epi8(this->o_ins + this->e_ins);
int8_t *F = F8_ + tid * SIMD_WIDTH8 * MAX_SEQ_LEN8;
int8_t *H_h = H8_ + tid * SIMD_WIDTH8 * MAX_SEQ_LEN8;
int8_t *H_v = H8__ + tid * SIMD_WIDTH8 * MAX_SEQ_LEN8;
int lane;
int8_t i, j;
uint8_t tlen[SIMD_WIDTH8];
uint8_t tail[SIMD_WIDTH8] __attribute((aligned(64)));
uint8_t head[SIMD_WIDTH8] __attribute((aligned(64)));
int32_t minq = 10000000;
for (int l=0; l<SIMD_WIDTH8; l++) {
tlen[l] = p[l].len1;
qlen[l] = p[l].len2;
if (p[l].len2 < minq) minq = p[l].len2;
}
minq -= 1; // for gscore
__m256i tlen256 = _mm256_load_si256((__m256i *) tlen);
__m256i qlen256 = _mm256_load_si256((__m256i *) qlen);
__m256i myband256 = _mm256_load_si256((__m256i *) myband);
__m256i zero256 = _mm256_setzero_si256();
__m256i one256 = _mm256_set1_epi8(1);
__m256i two256 = _mm256_set1_epi8(2);
__m256i max_ie256 = zero256;
__m256i ff256 = _mm256_set1_epi8(0xFF);
__m256i tail256 = qlen256, head256 = zero256;
_mm256_store_si256((__m256i *) head, head256);
_mm256_store_si256((__m256i *) tail, tail256);
//__m256i ib256 = _mm256_add_epi8(qlen256, qlen256);
// ib256 = _mm256_sub_epi8(ib256, one256);
__m256i mlen256 = _mm256_add_epi8(qlen256, myband256);
mlen256 = _mm256_min_epu8(mlen256, tlen256);
uint8_t temp[SIMD_WIDTH8] __attribute((aligned(64)));
uint8_t temp1[SIMD_WIDTH8] __attribute((aligned(64)));