Commit b34bb43b authored by Santiago Marco-Sola's avatar Santiago Marco-Sola
Browse files

Fixed compilation flags and removed mm includes

parent efed43e3
......@@ -27,19 +27,18 @@
##Contacts: Vasimuddin Md <vasimuddin.md@intel.com>; Sanchit Misra <sanchit.misra@intel.com>;
## Heng Li <hli@jimmy.harvard.edu>
##*****************************************************************************************/
EPI_TOOLCHAIN=/scratch/workspace-riscv/llvm-EPI-0.7-release-toolchain-cross-1602/
CC=$(EPI_TOOLCHAIN)/bin/clang
CXX=$(EPI_TOOLCHAIN)/bin/clang++
FMI=fmi
CC=gcc
CXX=g++
BWAMEM2_PATH=./bwa-mem2
ARCH_FLAGS=-march=native
CXXFLAGS=-std=c++11 -fopenmp $(ARCH_FLAGS) #-mtune=native -march=native
#CPPFLAGS=-DPRINT_OUTPUT -DENABLE_PREFETCH -DBWA_OTHER_ELE=0
CXXFLAGS=-std=c++11 -fopenmp
CPPFLAGS=-DENABLE_PREFETCH -DBWA_OTHER_ELE=0
INCLUDES=-I$(BWAMEM2_PATH)/src -I$(BWAMEM2_PATH)/ext/safestringlib/include # -DENABLE_PARSEC_HOOKS -I/romol/hooks/include
LIBS=-L$(BWAMEM2_PATH) -L$(BWAMEM2_PATH)/ext/safestringlib -lsafestring -fopenmp -lz -lbwa -ldl # -L/romol/hooks/lib -lhooks
INCLUDES=-I$(BWAMEM2_PATH)/src -I$(BWAMEM2_PATH)/ext/safestringlib/include
LIBS=-L$(BWAMEM2_PATH) -L$(BWAMEM2_PATH)/ext/safestringlib -lsafestring -fopenmp -lz -lbwa -ldl
.PHONY:all clean depend
.SUFFIXES:.cpp .o
......
......@@ -27,10 +27,11 @@
##Contacts: Vasimuddin Md <vasimuddin.md@intel.com>; Sanchit Misra <sanchit.misra@intel.com>;
## Heng Li <hli@jimmy.harvard.edu>
##*****************************************************************************************/
EPI_TOOLCHAIN=/scratch/workspace-riscv/llvm-EPI-0.7-release-toolchain-cross-1602/
CC=$(EPI_TOOLCHAIN)/bin/clang
CXX=$(EPI_TOOLCHAIN)/bin/clang++
EXE=bwa-mem2
CC=gcc
CXX=g++
MEM_FLAGS=-DSAIS=1
CPPFLAGS+=-DENABLE_PREFETCH -DV17=1 $(MEM_FLAGS)
......
EPI_TOOLCHAIN=/scratch/workspace-riscv/llvm-EPI-0.7-release-toolchain-cross-1602/
CC=$(EPI_TOOLCHAIN)/bin/clang
CXX=$(EPI_TOOLCHAIN)/bin/clang++
IDIR = include
MKDIR_P = mkdir -p
CC=gcc
CFLAGS=-I$(IDIR) -fPIE -fPIC -O2 -D_FORTIFY_SOURCE=2 -Wformat -Wformat-security
LDFLAGS=-z noexecstack -z relo -z now
......
......@@ -34,7 +34,6 @@ Authors: Sanchit Misra <sanchit.misra@intel.com>; Vasimuddin Md <vasimuddin.md@i
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <immintrin.h>
#include <limits.h>
#include <fstream>
......
......@@ -37,13 +37,8 @@ Authors: Vasimuddin Md <vasimuddin.md@intel.com>; Sanchit Misra <sanchit.misra@i
#include "macro.h"
#include "utils.h"
#if (__AVX512BW__ || __AVX2__)
#include <immintrin.h>
#else
#include <smmintrin.h> // for SSE4.1
#define __mmask8 uint8_t
#define __mmask16 uint16_t
#endif
#define MAX_SEQ_LEN_REF 256
#define MAX_SEQ_LEN_QER 128
......@@ -135,183 +130,6 @@ public:
int nthreads,
int32_t w);
#if ((!__AVX512BW__) & (!__AVX2__) & (__SSE2__))
// AVX256 is not updated for banding and separate ins/del in the inner loop.
// 8 bit vector code section
void getScores8(SeqPair *pairArray,
uint8_t *seqBufRef,
uint8_t *seqBufQer,
int32_t numPairs,
uint16_t numThreads,
int32_t w);
void smithWatermanBatchWrapper8(SeqPair *pairArray,
uint8_t *seqBufRef,
uint8_t *seqBufQer,
int32_t numPairs,
uint16_t numThreads,
int32_t w);
void smithWaterman128_8(uint8_t seq1SoA[],
uint8_t seq2SoA[],
uint8_t nrow,
uint8_t ncol,
SeqPair *p,
uint8_t h0[],
uint16_t tid,
int32_t numPairs,
int zdrop,
int32_t w,
uint8_t qlen[],
uint8_t myband[]);
// 16 bit vector code section
void getScores16(SeqPair *pairArray,
uint8_t *seqBufRef,
uint8_t *seqBufQer,
int32_t numPairs,
uint16_t numThreads,
int32_t w);
void smithWatermanBatchWrapper16(SeqPair *pairArray,
uint8_t *seqBufRef,
uint8_t *seqBufQer,
int32_t numPairs,
uint16_t numThreads,
int32_t w);
void smithWaterman128_16(uint16_t seq1SoA[],
uint16_t seq2SoA[],
uint16_t nrow,
uint16_t ncol,
SeqPair *p,
uint16_t h0[],
uint16_t tid,
int32_t numPairs,
int zdrop,
int32_t w,
uint16_t qlen[],
uint16_t myband[]);
#endif //SSE2
#if ((!__AVX512BW__) & (__AVX2__))
// AVX256 is not updated for banding and separate ins/del in the inner loop.
// 8 bit vector code section
void getScores8(SeqPair *pairArray,
uint8_t *seqBufRef,
uint8_t *seqBufQer,
int32_t numPairs,
uint16_t numThreads,
int32_t w);
void smithWatermanBatchWrapper8(SeqPair *pairArray,
uint8_t *seqBufRef,
uint8_t *seqBufQer,
int32_t numPairs,
uint16_t numThreads,
int32_t w);
void smithWaterman256_8(uint8_t seq1SoA[],
uint8_t seq2SoA[],
uint8_t nrow,
uint8_t ncol,
SeqPair *p,
uint8_t h0[],
uint16_t tid,
int32_t numPairs,
int zdrop,
int32_t w,
uint8_t qlen[],
uint8_t myband[]);
// 16 bit vector code section
void getScores16(SeqPair *pairArray,
uint8_t *seqBufRef,
uint8_t *seqBufQer,
int32_t numPairs,
uint16_t numThreads,
int32_t w);
void smithWatermanBatchWrapper16(SeqPair *pairArray,
uint8_t *seqBufRef,
uint8_t *seqBufQer,
int32_t numPairs,
uint16_t numThreads,
int32_t w);
void smithWaterman256_16(uint16_t seq1SoA[],
uint16_t seq2SoA[],
uint16_t nrow,
uint16_t ncol,
SeqPair *p,
uint16_t h0[],
uint16_t tid,
int32_t numPairs,
int zdrop,
int32_t w,
uint16_t qlen[],
uint16_t myband[],
int64_t* nCellsComputed);
#endif //axv2
#if __AVX512BW__
// 8 bit vector code section
void getScores8(SeqPair *pairArray,
uint8_t *seqBufRef,
uint8_t *seqBufQer,
int32_t numPairs,
uint16_t numThreads,
int32_t w);
void smithWatermanBatchWrapper8(SeqPair *pairArray,
uint8_t *seqBufRef,
uint8_t *seqBufQer,
int32_t numPairs,
uint16_t numThreads,
int32_t w);
void smithWaterman512_8(uint8_t seq1SoA[],
uint8_t seq2SoA[],
uint8_t nrow,
uint8_t ncol,
SeqPair *p,
uint8_t h0[],
uint16_t tid,
int32_t numPairs,
int zdrop,
int32_t w,
uint8_t qlen[],
uint8_t myband[]);
// 16 bit vector code section
void getScores16(SeqPair *pairArray,
uint8_t *seqBufRef,
uint8_t *seqBufQer,
int32_t numPairs,
uint16_t numThreads,
int32_t w);
void smithWatermanBatchWrapper16(SeqPair *pairArray,
uint8_t *seqBufRef,
uint8_t *seqBufQer,
int32_t numPairs,
uint16_t numThreads,
int32_t w);
void smithWaterman512_16(uint16_t seq1SoA[],
uint16_t seq2SoA[],
uint16_t nrow,
uint16_t ncol,
SeqPair *p,
uint16_t h0[],
uint16_t tid,
int32_t numPairs,
int zdrop,
int32_t w,
uint16_t qlen[],
uint16_t myband[]);
#endif
int64_t getTicks();
private:
......
......@@ -30,7 +30,6 @@
#include <stdlib.h>
#include <stdint.h>
#include <assert.h>
#include <emmintrin.h>
#include "ksw.h"
#include "macro.h"
......@@ -108,235 +107,6 @@ kswq_t *ksw_qinit(int size, int qlen, const uint8_t *query, int m, const int8_t
return q;
}
kswr_t ksw_u8(kswq_t *q, int tlen, const uint8_t *target,
int _o_del, int _e_del, int _o_ins, int _e_ins,
int xtra) // the first gap costs -(_o+_e)
{
int slen, i, m_b, n_b, te = -1, gmax = 0, minsc, endsc;
uint64_t *b;
__m128i zero, oe_del, e_del, oe_ins, e_ins, shift, *H0, *H1, *E, *Hmax;
kswr_t r;
#define __max_16(ret, xx) do { \
(xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 8)); \
(xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 4)); \
(xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 2)); \
(xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 1)); \
(ret) = _mm_extract_epi16((xx), 0) & 0x00ff; \
} while (0)
// initialization
r = g_defr;
minsc = (xtra & KSW_XSUBO)? xtra & 0xffff : 0x10000;
endsc = (xtra & KSW_XSTOP)? xtra & 0xffff : 0x10000;
m_b = n_b = 0; b = 0;
zero = _mm_set1_epi32(0);
oe_del = _mm_set1_epi8(_o_del + _e_del);
e_del = _mm_set1_epi8(_e_del);
oe_ins = _mm_set1_epi8(_o_ins + _e_ins);
e_ins = _mm_set1_epi8(_e_ins);
shift = _mm_set1_epi8(q->shift);
H0 = q->H0; H1 = q->H1; E = q->E; Hmax = q->Hmax;
slen = q->slen;
for (i = 0; i < slen; ++i) {
_mm_store_si128(E + i, zero);
_mm_store_si128(H0 + i, zero);
_mm_store_si128(Hmax + i, zero);
}
// the core loop
for (i = 0; i < tlen; ++i) {
int j, k, cmp, imax;
__m128i e, h, t, f = zero, max = zero, *S = q->qp + target[i] * slen; // s is the 1st score vector
h = _mm_load_si128(H0 + slen - 1); // h={2,5,8,11,14,17,-1,-1} in the above example
h = _mm_slli_si128(h, 1); // h=H(i-1,-1); << instead of >> because x64 is little-endian
for (j = 0; LIKELY(j < slen); ++j) {
/* SW cells are computed in the following order:
* H(i,j) = max{H(i-1,j-1)+S(i,j), E(i,j), F(i,j)}
* E(i+1,j) = max{H(i,j)-q, E(i,j)-r}
* F(i,j+1) = max{H(i,j)-q, F(i,j)-r}
*/
// compute H'(i,j); note that at the beginning, h=H'(i-1,j-1)
h = _mm_adds_epu8(h, _mm_load_si128(S + j));
h = _mm_subs_epu8(h, shift); // h=H'(i-1,j-1)+S(i,j)
e = _mm_load_si128(E + j); // e=E'(i,j)
h = _mm_max_epu8(h, e);
h = _mm_max_epu8(h, f); // h=H'(i,j)
max = _mm_max_epu8(max, h); // set max
_mm_store_si128(H1 + j, h); // save to H'(i,j)
// now compute E'(i+1,j)
e = _mm_subs_epu8(e, e_del); // e=E'(i,j) - e_del
t = _mm_subs_epu8(h, oe_del); // h=H'(i,j) - o_del - e_del
e = _mm_max_epu8(e, t); // e=E'(i+1,j)
_mm_store_si128(E + j, e); // save to E'(i+1,j)
// now compute F'(i,j+1)
f = _mm_subs_epu8(f, e_ins);
t = _mm_subs_epu8(h, oe_ins); // h=H'(i,j) - o_ins - e_ins
f = _mm_max_epu8(f, t);
// get H'(i-1,j) and prepare for the next j
h = _mm_load_si128(H0 + j); // h=H'(i-1,j)
}
// NB: we do not need to set E(i,j) as we disallow adjecent insertion and then deletion
for (k = 0; LIKELY(k < 16); ++k) { // this block mimics SWPS3; NB: H(i,j) updated in the lazy-F loop cannot exceed max
f = _mm_slli_si128(f, 1);
for (j = 0; LIKELY(j < slen); ++j) {
h = _mm_load_si128(H1 + j);
h = _mm_max_epu8(h, f); // h=H'(i,j)
_mm_store_si128(H1 + j, h);
h = _mm_subs_epu8(h, oe_ins);
f = _mm_subs_epu8(f, e_ins);
cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(_mm_subs_epu8(f, h), zero));
if (UNLIKELY(cmp == 0xffff)) goto end_loop16;
}
}
end_loop16:
__max_16(imax, max); // imax is the maximum number in max
if (imax >= minsc) { // write the b array; this condition adds branching unfornately
if (n_b == 0 || (int32_t)b[n_b-1] + 1 != i) { // then append
if (n_b == m_b) {
m_b = m_b? m_b<<1 : 8;
b = (uint64_t*) realloc (b, 8 * m_b);
}
b[n_b++] = (uint64_t)imax<<32 | i;
} else if ((int)(b[n_b-1]>>32) < imax) b[n_b-1] = (uint64_t)imax<<32 | i; // modify the last
}
if (imax > gmax) {
gmax = imax; te = i; // te is the end position on the target
for (j = 0; LIKELY(j < slen); ++j) // keep the H1 vector
_mm_store_si128(Hmax + j, _mm_load_si128(H1 + j));
if (gmax + q->shift >= 255 || gmax >= endsc) break;
}
S = H1; H1 = H0; H0 = S; // swap H0 and H1
}
r.score = gmax + q->shift < 255? gmax : 255;
r.te = te;
if (r.score != 255) { // get a->qe, the end of query match; find the 2nd best score
int max = -1, tmp, low, high, qlen = slen * 16;
uint8_t *t = (uint8_t*) Hmax;
for (i = 0; i < qlen; ++i, ++t)
if ((int)*t > max) max = *t, r.qe = i / 16 + i % 16 * slen;
else if ((int)*t == max && (tmp = i / 16 + i % 16 * slen) < r.qe) r.qe = tmp;
//printf("%d,%d\n", max, gmax);
if (b) {
i = (r.score + q->max - 1) / q->max;
low = te - i; high = te + i;
for (i = 0; i < n_b; ++i) {
int e = (int32_t)b[i];
if ((e < low || e > high) && (int)(b[i]>>32) > r.score2)
r.score2 = b[i]>>32, r.te2 = e;
}
}
}
free(b);
return r;
}
kswr_t ksw_i16(kswq_t *q, int tlen, const uint8_t *target, int _o_del, int _e_del, int _o_ins, int _e_ins, int xtra) // the first gap costs -(_o+_e)
{
int slen, i, m_b, n_b, te = -1, gmax = 0, minsc, endsc;
uint64_t *b;
__m128i zero, oe_del, e_del, oe_ins, e_ins, *H0, *H1, *E, *Hmax;
kswr_t r;
#define __max_8(ret, xx) do { \
(xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 8)); \
(xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 4)); \
(xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 2)); \
(ret) = _mm_extract_epi16((xx), 0); \
} while (0)
// initialization
r = g_defr;
minsc = (xtra&KSW_XSUBO)? xtra&0xffff : 0x10000;
endsc = (xtra&KSW_XSTOP)? xtra&0xffff : 0x10000;
m_b = n_b = 0; b = 0;
zero = _mm_set1_epi32(0);
oe_del = _mm_set1_epi16(_o_del + _e_del);
e_del = _mm_set1_epi16(_e_del);
oe_ins = _mm_set1_epi16(_o_ins + _e_ins);
e_ins = _mm_set1_epi16(_e_ins);
H0 = q->H0; H1 = q->H1; E = q->E; Hmax = q->Hmax;
slen = q->slen;
for (i = 0; i < slen; ++i) {
_mm_store_si128(E + i, zero);
_mm_store_si128(H0 + i, zero);
_mm_store_si128(Hmax + i, zero);
}
// the core loop
for (i = 0; i < tlen; ++i) {
int j, k, imax;
__m128i e, t, h, f = zero, max = zero, *S = q->qp + target[i] * slen; // s is the 1st score vector
h = _mm_load_si128(H0 + slen - 1); // h={2,5,8,11,14,17,-1,-1} in the above example
h = _mm_slli_si128(h, 2);
for (j = 0; LIKELY(j < slen); ++j) {
h = _mm_adds_epi16(h, *S++);
e = _mm_load_si128(E + j);
h = _mm_max_epi16(h, e);
h = _mm_max_epi16(h, f);
max = _mm_max_epi16(max, h);
_mm_store_si128(H1 + j, h);
e = _mm_subs_epu16(e, e_del);
t = _mm_subs_epu16(h, oe_del);
e = _mm_max_epi16(e, t);
_mm_store_si128(E + j, e);
f = _mm_subs_epu16(f, e_ins);
t = _mm_subs_epu16(h, oe_ins);
f = _mm_max_epi16(f, t);
h = _mm_load_si128(H0 + j);
}
for (k = 0; LIKELY(k < 16); ++k) {
f = _mm_slli_si128(f, 2);
for (j = 0; LIKELY(j < slen); ++j) {
h = _mm_load_si128(H1 + j);
h = _mm_max_epi16(h, f);
_mm_store_si128(H1 + j, h);
h = _mm_subs_epu16(h, oe_ins);
f = _mm_subs_epu16(f, e_ins);
if(UNLIKELY(!_mm_movemask_epi8(_mm_cmpgt_epi16(f, h)))) goto end_loop8;
}
}
end_loop8:
__max_8(imax, max);
if (imax >= minsc) {
if (n_b == 0 || (int32_t)b[n_b-1] + 1 != i) {
if (n_b == m_b) {
m_b = m_b? m_b<<1 : 8;
b = (uint64_t*)realloc(b, 8 * m_b);
}
b[n_b++] = (uint64_t)imax<<32 | i;
} else if ((int)(b[n_b-1]>>32) < imax) b[n_b-1] = (uint64_t)imax<<32 | i; // modify the last
}
if (imax > gmax) {
gmax = imax; te = i;
for (j = 0; LIKELY(j < slen); ++j)
_mm_store_si128(Hmax + j, _mm_load_si128(H1 + j));
if (gmax >= endsc) break;
}
S = H1; H1 = H0; H0 = S;
}
r.score = gmax; r.te = te;
{
int max = -1, tmp, low, high, qlen = slen * 8;
uint16_t *t = (uint16_t*)Hmax;
for (i = 0, r.qe = -1; i < qlen; ++i, ++t)
if ((int)*t > max) max = *t, r.qe = i / 8 + i % 8 * slen;
else if ((int)*t == max && (tmp = i / 8 + i % 8 * slen) < r.qe) r.qe = tmp;
if (b) {
i = (r.score + q->max - 1) / q->max;
low = te - i; high = te + i;
for (i = 0; i < n_b; ++i) {
int e = (int32_t)b[i];
if ((e < low || e > high) && (int)(b[i]>>32) > r.score2)
r.score2 = b[i]>>32, r.te2 = e;
}
}
}
free(b);
return r;
}
static inline void revseq(int l, uint8_t *s)
{
int i, t;
......
......@@ -26,7 +26,6 @@
#define __AC_KSW_H
#include <stdint.h>
#include <emmintrin.h>
#define KSW_XBYTE 0x10000
#define KSW_XSTOP 0x20000
......
......@@ -38,8 +38,6 @@ Authors: Vasimuddin Md <vasimuddin.md@intel.com>; Sanchit Misra <sanchit.misra@i
#if !MAINY
#include "ksw.h"
#include "bandedSWA.h"
#else
#include <immintrin.h>
#endif
#ifdef __GNUC__
......@@ -108,14 +106,6 @@ typedef struct {
const kswr_t g_defr = { 0, -1, -1, -1, -1, -1, -1 };
#define __max_16(ret, xx) do { \
(xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 8)); \
(xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 4)); \
(xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 2)); \
(xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 1)); \
(ret) = _mm_extract_epi16((xx), 0) & 0x00ff; \
} while (0)
#define DP 6
#define DP1 7
#define DP2 8
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment