Commit c48f3ca3 authored by Marc's avatar Marc
Browse files

loop unroll in grayscale

parent 4a5bb89e
VER=$(VNAME)2020.2
BID=4254
VER=$(VNAME)2020.4
BID=4261
CC=gcc
SED=sed
......@@ -510,7 +510,7 @@ ghdl-time: $(SIMTOP)
./$(SIMTOP) $(GHDLRUNOPT)
ghdl-clean:
-rm -rf gnu $(SIMTOP) make.ghdl *.ghw *.vcd
-rm -rf gnu $(SIMTOP) make.ghdl *.ghw *.vcd *.fst
######### NcSim targets ############
......
......@@ -172,10 +172,12 @@ architecture rtl of iu3 is
constant S1_SSUB : std_logic_vector (4 downto 0) :="01110";
constant S1_SMUL : std_logic_vector (4 downto 0) :="01111";
constant S1_MOVB : std_logic_vector (4 downto 0) :="10000";
constant S1_SHFT : std_logic_vector (4 downto 0) :="10001";
constant S1_UMUL : std_logic_vector (4 downto 0) :="10011";
constant S1_UDIV : std_logic_vector (4 downto 0) :="10100";
constant S1_UMAX : std_logic_vector (4 downto 0) :="10101";
constant S1_UMIN : std_logic_vector (4 downto 0) :="10110";
constant S1_SSHFT : std_logic_vector (4 downto 0):="11001";
constant S1_USADD : std_logic_vector (4 downto 0):="11101";
constant S1_USSUB : std_logic_vector (4 downto 0):="11110";
constant S1_USMUL : std_logic_vector (4 downto 0):="11111";
......@@ -2344,7 +2346,7 @@ end;
immediate_data(0) := inst(0);
-- same as for signed multiplication/division but no negatives, instead inst(4) is used with inst(0)
-- adds inst(4)inst(0) (allowing 3, 5, 6, 7, 9, 10, 11...)
when S1_UMUL | S1_USMUL | S1_UDIV =>
when S1_UMUL | S1_USMUL =>
immediate_data(rhzeros + 1) := '1';
immediate_data(1 downto 0) := inst(4) & inst(0);
-- same as for multiplication but starts at 0
......@@ -2368,6 +2370,8 @@ end;
immediate_data(rhzeros + 1) := '1';
immediate_data(0) := inst(0);
end if;
when S1_SHFT | S1_SSHFT =>
immediate_data := (7 downto 5 => inst(4)) & inst(4 downto 0);
when others =>
immediate_data := "000" & inst(4 downto 0);
end case;
......
package version is
constant grlib_version : integer := 2020200;
constant grlib_build : integer := 4254;
constant grlib_version : integer := 2020400;
constant grlib_build : integer := 4261;
end;
......@@ -249,7 +249,7 @@ architecture rtl of simd_module is
sel := "00"; -- result as it is, no saturation
if sat = '1' and ovf = '1' then
if sign = '1' then
if asign = bsign then -- positive result
if asign = '0' then
sel := "01"; -- result is 7f signed max
else
sel := "10"; -- result is 80 signed min
......@@ -280,9 +280,11 @@ architecture rtl of simd_module is
variable z : std_logic_vector(VLEN downto 0);
variable mux : std_logic_vector(1 downto 0);
variable res : vector_component;
variable ovf : std_logic;
begin
z := ((sign and a(a'left))&a) + ((sign and b(b'left))&b);
mux := sat_mux(a(a'left), b(b'left), sign, sat, z(z'left));
ovf := z(z'left) or (z(a'left) and sign);
mux := sat_mux(a(a'left), b(b'left), sign, sat, ovf);
sat_sel(mux, z(vector_component'range), res);
return res;
end add;
......@@ -308,7 +310,11 @@ architecture rtl of simd_module is
mux := sat_mux(tmp(0)(tmp(0)'left), tmp(1)(tmp(1)'left), sign, sat, ovf);
sat_sel(mux, acc, res);
if(sign = '1') then
z := std_logic_vector(resize(signed(res), word'length));
if (sat = '1') then
z := std_logic_vector(resize(signed(res), word'length));
else
z := std_logic_vector(resize(signed(res(vector_component'range)), word'length));
end if;
else
z := std_logic_vector(resize(unsigned(res), word'length));
end if;
......@@ -330,7 +336,7 @@ architecture rtl of simd_module is
end if;
elsif sign = '0' and sat = '1' then
if z(z'left) = '1' then
z(vector_component'range) := (others => '1');
z(vector_component'range) := (others => '0');
end if;
end if;
return z(vector_component'range);
......
......@@ -11,6 +11,7 @@ XCFLAGS0=-O0 -g -msoft-float -mcpu=v8
bin_change:
g++ -o bin_change bin_change.cc
g++ -o bin_change2 bin_change2.cc
make.x:
g++ -o make.x make_simd_op.cc
systest:
......@@ -62,18 +63,18 @@ mat_mul: mat_mul.c
cp mat_mul.srec $(CURRENT_DIR)/test.srec
sparc-gaisler-elf-objdump -d mat_mul.o > mat_mul.dump
grayscale: grayscale.c arrays.h image256.h
grayscale: grayscale.c arrays.h image256.h image32.h
$(XCC) $(XCFLAGS) -D N=$(par1) -c $<
$(XCC) $(XCFLAGS) grayscale.o -o grayscale.exe
sparc-gaisler-elf-objcopy -O srec --gap-fill 0 grayscale.exe grayscale.srec
cp grayscale.srec $(CURRENT_DIR)/test.srec
sparc-gaisler-elf-objdump -d grayscale.o > grayscale.dump
grayscale_simd: grayscale_simd.c arrays.h image256.h
grayscale_simd: grayscale_simd.c arrays.h image256.h image32.h
$(XCC) $(XCFLAGS) -D N=$(par1) -c $<
$(XCC) $(XCFLAGS) grayscale_simd.o -o grayscale_simd.exe
sparc-gaisler-elf-objcopy -O srec --gap-fill 0 grayscale_simd.exe tmp.grayscale_simd.srec
./bin_change tmp.grayscale_simd.srec grayscale_simd.list grayscale_simd.srec 83320001
./bin_change2 tmp.grayscale_simd.srec grayscale_simd.list grayscale_simd.srec 83386002
cp grayscale_simd.srec $(CURRENT_DIR)/test.srec
sparc-gaisler-elf-objdump -d grayscale_simd.o > grayscale_simd.dump
......@@ -110,4 +111,4 @@ mat_mul_simd32: mat_mul_simd32.o bin_change
sparc-gaisler-elf-objdump -d mat_mul_simd32.o > mat_mul_simd32.dump
clean:
rm -f *.exe *.o *.dump bin_change make.x tmp*.*
rm -f *.exe *.o *.dump bin_change2 bin_change make.x tmp*.*
#include "image256.h"
#include "image32.h"
#ifndef N
#define N 2
#endif
#if N == 2
#define IMAGE_ARRAY 255,0,0,255,255,255,0,255,255,255,0,255,255,0,0,255
#define IMAGE_ARRAY 255,0,0,0, 255,255,0,0, 255,255,0,0, 255,0,0,0
#elif N == 4
#define IMAGE_ARRAY 255,255,255,255,255,0,0,255,255,0,0,255,255,255,255,255,255,0,0,255,255,178,127,255,0,19,127,255,255,255,255,255,255,255,255,255,0,19,127,255,255,216,0,255,0,19,127,255,255,255,255,255,127,51,0,255,0,19,127,255,127,51,0,255
#define IMAGE_ARRAY 255,255,255,0,255,0,0,0,255,0,0,0,255,255,255,0,255,0,0,0,255,178,127,0,0,19,127,0,255,255,255,0,255,255,255,0,0,19,127,0,255,216,0,0,0,19,127,0,255,255,255,0,127,51,0,0,0,19,127,0,127,51,0,0
#elif N == 8
#define IMAGE_ARRAY 66,165,245,255,66,165,245,255,66,165,245,255,66,165,245,255,66,165,245,255,66,165,245,255,66,165,245,255,66,165,245,255,66,165,245,255,66,165,245,255,248,187,208,255,248,187,208,255,248,187,208,255,248,187,208,255,66,165,245,255,66,165,245,255,66,165,245,255,248,187,208,255,248,187,208,255,248,187,208,255,248,187,208,255,248,187,208,255,248,187,208,255,66,165,245,255,66,165,245,255,244,143,177,255,248,187,208,255,0,0,0,255,248,187,208,255,0,0,0,255,248,187,208,255,66,165,245,255,244,143,177,255,248,187,208,255,248,187,208,255,26,35,126,255,248,187,208,255,26,35,126,255,248,187,208,255,244,143,177,255,244,143,177,255,244,143,177,255,240,98,146,255,248,187,208,255,248,187,208,255,248,187,208,255,240,98,146,255,244,143,177,255,66,165,245,255,173,20,87,255,244,143,177,255,244,143,177,255,248,187,208,255,248,187,208,255,173,20,87,255,66,165,245,255,76,175,80,255,173,20,87,255,194,24,91,255,194,24,91,255,76,175,80,255,173,20,87,255,194,24,91,255,76,175,80,255
#define IMAGE_ARRAY 66,165,245,0,66,165,245,0,66,165,245,0,66,165,245,0,66,165,245,0,66,165,245,0,66,165,245,0,66,165,245,0,66,165,245,0,66,165,245,0,248,187,208,0,248,187,208,0,248,187,208,0,248,187,208,0,66,165,245,0,66,165,245,0,66,165,245,0,248,187,208,0,248,187,208,0,248,187,208,0,248,187,208,0,248,187,208,0,248,187,208,0,66,165,245,0,66,165,245,0,244,143,177,0,248,187,208,0,0,0,0,0,248,187,208,0,0,0,0,0,248,187,208,0,66,165,245,0,244,143,177,0,248,187,208,0,248,187,208,0,26,35,126,0,248,187,208,0,26,35,126,0,248,187,208,0,244,143,177,0,244,143,177,0,244,143,177,0,240,98,146,0,248,187,208,0,248,187,208,0,248,187,208,0,240,98,146,0,244,143,177,0,66,165,245,0,173,20,87,0,244,143,177,0,244,143,177,0,248,187,208,0,248,187,208,0,173,20,87,0,66,165,245,0,76,175,80,0,173,20,87,0,194,24,91,0,194,24,91,0,76,175,80,0,173,20,87,0,194,24,91,0,76,175,80,0
#elif N == 16
#define IMAGE_ARRAY 96,125,139,0,96,125,139,0,96,125,139,0,96,125,139,0,96,125,139,0,96,125,139,0,96,125,139,0,96,125,139,0,96,125,139,0,96,125,139,0,96,125,139,0,96,125,139,0,96,125,139,0,96,125,139,0,96,125,139,0,96,125,139,0,96,125,139,0,96,125,139,0,96,125,139,0,96,125,139,0,96,125,139,0,96,125,139,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,96,125,139,0,96,125,139,0,96,125,139,0,96,125,139,0,96,125,139,0,96,125,139,0,96,125,139,0,96,125,139,0,96,125,139,0,96,125,139,0,0,0,0,0,0,0,0,0,229,57,53,0,229,57,53,0,229,57,53,0,229,57,53,0,0,0,0,0,0,0,0,0,96,125,139,0,96,125,139,0,96,125,139,0,96,125,139,0,96,125,139,0,96,125,139,0,96,125,139,0,0,0,0,0,229,57,53,0,229,57,53,0,229,57,53,0,229,57,53,0,229,57,53,0,229,57,53,0,229,57,53,0,229,57,53,0,0,0,0,0,96,125,139,0,96,125,139,0,96,125,139,0,96,125,139,0,96,125,139,0,0,0,0,0,229,57,53,0,229,57,53,0,255,255,255,0,229,57,53,0,229,57,53,0,229,57,53,0,229,57,53,0,229,57,53,0,229,57,53,0,0,0,0,0,0,0,0,0,96,125,139,0,96,125,139,0,96,125,139,0,96,125,139,0,0,0,0,0,229,57,53,0,255,255,255,0,229,57,53,0,229,57,53,0,229,57,53,0,229,57,53,0,229,57,53,0,229,57,53,0,229,57,53,0,229,57,53,0,0,0,0,0,96,125,139,0,96,125,139,0,96,125,139,0,0,0,0,0,229,57,53,0,229,57,53,0,229,57,53,0,229,57,53,0,229,57,53,0,229,57,53,0,229,57,53,0,229,57,53,0,229,57,53,0,229,57,53,0,229,57,53,0,229,57,53,0,0,0,0,0,96,125,139,0,96,125,139,0,0,0,0,0,0,0,0,0,0,0,0,0,229,57,53,0,229,57,53,0,229,57,53,0,229,57,53,0,0,0,0,0,0,0,0,0,0,0,0,0,229,57,53,0,229,57,53,0,229,57,53,0,0,0,0,0,96,125,139,0,96,125,139,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,229,57,53,0,229,57,53,0,0,0,0,0,255,255,255,0,255,255,255,0,255,255,255,0,0,0,0,0,229,57,53,0,229,57,53,0,0,0,0,0,96,125,139,0,96,125,139,0,0,0,0,0,158,158,158,0,255,255,255,0,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,0,255,255,255,0,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,96,125,139,0,96,125,139,0,96,125,139,0,0,0,0,0,255,255,255,0,255,255,255,0,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,0,255,255,255,0,255,255,255,0,0,
0,0,0,255,255,255,0,0,0,0,0,96,125,139,0,96,125,139,0,96,125,139,0,96,125,139,0,0,0,0,0,158,158,158,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,0,255,255,255,0,0,0,0,0,96,125,139,0,96,125,139,0,96,125,139,0,96,125,139,0,96,125,139,0,0,0,0,0,158,158,158,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,0,0,0,0,96,125,139,0,96,125,139,0,96,125,139,0,96,125,139,0,96,125,139,0,96,125,139,0,96,125,139,0,0,0,0,0,0,0,0,0,158,158,158,0,158,158,158,0,255,255,255,0,255,255,255,0,0,0,0,0,0,0,0,0,96,125,139,0,96,125,139,0,96,125,139,0,96,125,139,0,96,125,139,0,96,125,139,0,96,125,139,0,96,125,139,0,96,125,139,0,96,125,139,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,96,125,139,0,96,125,139,0,96,125,139,0,96,125,139,0,96,125,139,0,96,125,139,0,96,125,139,0,96,125,139,0,96,125,139,0,96,125,139,0,96,125,139,0,96,125,139,0,96,125,139,0,96,125,139,0,96,125,139,0,96,125,139,0,96,125,139,0,96,125,139,0,96,125,139,0,96,125,139,0,96,125,139,0,96,125,139,0
#elif N == 32
#define IMAGE_ARRAY IMAGE32
#else
#define IMAGE_ARRAY IMAGE256
#endif
#include <iostream>
#include <string>
#include <fstream>
using namespace std;
// Replaces every occurrence of `pattern` in `line` with `new_inst`, in place.
//
// Parameters:
//   line     - text to edit; modified in place.
//   pattern  - substring to search for.
//   new_inst - text substituted for each occurrence found.
// Returns the number of substitutions performed.
//
// The scan resumes just past each inserted replacement, so text that has
// already been substituted is never matched again. This guarantees
// termination even when `new_inst` itself contains `pattern`. An empty
// pattern matches nothing and leaves `line` untouched (find("") would
// otherwise succeed at every position and never finish).
int change (std::string &line, std::string pattern, std::string new_inst){
    if (pattern.empty()) {
        return 0;
    }
    int res = 0;
    std::string::size_type pos = line.find(pattern);
    while (pos != std::string::npos){
        line.replace(pos, pattern.length(), new_inst);
        res++;
        // Continue after the text just written, not from the start of the line.
        pos = line.find(pattern, pos + new_inst.length());
    }
    return res;
}
// Command-line entry point.
//
// Arguments: original_file list_file output_file pattern
//   original_file - text file to read line by line.
//   list_file     - its first line is the replacement text.
//   output_file   - receives every line of original_file with each
//                   occurrence of `pattern` replaced by that text.
//   pattern       - substring to search for.
//
// Returns 0 on success and 1 on any failure (bad argument count, a file that
// cannot be opened, an empty list file, or a write error), so that a calling
// build recipe stops instead of continuing with a stale or empty output.
int main(int argc, char* argv[]){
    if(argc != 5) {
        std::cerr<<"Usage: "<<(argc > 0 ? argv[0] : "bin_change")
                 <<" original_file list_file output_file pattern\n";
        return 1;
    }
    std::string pattern = argv[4];

    std::ifstream source(argv[1]);
    if(!source.is_open()){
        std::cerr<<"Unable to open source file\n";
        return 1;
    }

    std::ifstream list(argv[2]);
    if(!list.is_open()){
        std::cerr<<"Unable to open list file\n";
        return 1;
    }

    // Only the first line of the list is used as the replacement text.
    std::string new_inst;
    if(!std::getline(list,new_inst)){
        std::cerr<<"Provided list is empty\n";
        return 1;
    }

    // Open (and truncate) the output only after the inputs have been
    // validated, so a bad invocation does not destroy an existing file.
    std::ofstream output(argv[3],std::ios::trunc);
    if(!output.is_open()){
        std::cerr<<"Unable to open output file\n";
        return 1;
    }

    int sum = 0;
    std::string line;
    while(std::getline(source,line)){
        sum+=change(line,pattern,new_inst);
        output<<line<<"\n";
    }
    output.close();
    if(output.fail()){
        std::cerr<<"Error writing output file\n";
        return 1;
    }

    // Extra lines left in the list are reported but not treated as an error.
    if(std::getline(list,line)) std::cout<<"Not enough lines in source for the provided list\n";
    std::cout<<"Substitution completed, total of "<<sum<<" substitutions done\n";
    return 0;
}
......@@ -6,38 +6,43 @@
#define N 2
#endif
void init(unsigned char a[N][N][4]){
for (int i = 0; i<N; i++)
for (int j = 0; j<N; j++){
a[i][j][0] = rand()%255;
a[i][j][1] = rand()%255;
a[i][j][2] = rand()%255;
a[i][j][3] = 255;
}
}
unsigned char shift_and_add(unsigned char r, unsigned char g, unsigned char b){
unsigned char ret;
ret = (r>>2) + (g>>2) + (b>>2);
return ret;
}
//void init(unsigned char a[N][N][4]){
// for (int i = 0; i<N; i++)
// for (int j = 0; j<N; j++){
// a[i][j][0] = rand()%255;
// a[i][j][1] = rand()%255;
// a[i][j][2] = rand()%255;
// a[i][j][3] = 255;
// }
//}
//unsigned char shift_and_add(unsigned char r, unsigned char g, unsigned char b){
// unsigned char ret;
// ret = (r>>2) + (g>>2) + (b>>2);
// return ret;
//}
void grayscale(unsigned char src[N][N][4], unsigned char dst[N][N][3]){
__attribute__((optimize("unroll-loops")))
void grayscale(unsigned char src[N][N][4], unsigned char dst[N][N]){
unsigned char color;
asm("nop");
asm("srl %i0, %o1, %g2");
asm("nop");
for (int i = 0; i<N; i++)
for (int j = 0; j<N; j++){
color = shift_and_add(src[i][j][0], src[i][j][1], src[i][j][2]);
dst[i][j][0] = color;
dst[i][j][1] = color;
dst[i][j][2] = color;
dst[i][j] = (src[i][j][0]>>2) + (src[i][j][1]>>2) + (src[i][j][2]>>2);
// dst[i][j] = shift_and_add(src[i][j][0], src[i][j][1], src[i][j][2]);
}
asm("nop");
asm("srl %i0, %o1, %g2");
asm("nop");
}
void print(unsigned char src[N][N][3]) {
void print(unsigned char src[N][N]) {
printf("P3\n%d %d\n255\n",N,N);
for (int i = 0; i<N; i++){
for (int j = 0; j<N; j++){
printf("%d %d %d ", src[i][j][0], src[i][j][1], src[i][j][2]);
printf("%d %d %d ", src[i][j], src[i][j], src[i][j]);
}
printf("\n");
}
......@@ -46,7 +51,7 @@ void print(unsigned char src[N][N][3]) {
int main(){
unsigned char source[N][N][4] = {IMAGE_ARRAY};
unsigned char dest[N][N][3];
unsigned char dest[N][N];
//init(source);
grayscale(source, dest);
print(dest);
......
This diff is collapsed.
......@@ -6,42 +6,52 @@
#define N 2
#endif
void init(unsigned char a[N][N][4]){
for (int i = 0; i<N; i++)
for (int j = 0; j<N; j++){
a[i][j][0] = rand()%255;
a[i][j][1] = rand()%255;
a[i][j][2] = rand()%255;
a[i][j][3] = 255;
}
}
//void init(unsigned char a[N][N][4]){
// for (int i = 0; i<N; i++)
// for (int j = 0; j<N; j++){
// a[i][j][0] = rand()%255;
// a[i][j][1] = rand()%255;
// a[i][j][2] = rand()%255;
// a[i][j][3] = 255;
// }
//}
int b = 0xfcfcfc7f;
//int b = 0xfcfcfc7f;
//int shift_and_add(int a){
//int r;
//asm("srl %1, %0, %0"
// : "=r"(r)
// : "r"(a), "0"(b));
//return r;
//}
int shift_and_add(int a);
asm("shift_and_add:");
asm("retl");
asm("srl %o0, %g1, %g1");
int shift_and_add(int a){
int r;
asm("srl %1, %0, %0"
: "=r"(r)
: "r"(a), "0"(b));
return r;
}
void grayscale(unsigned char src[N][N][4], unsigned char dst[N][N][3]){
__attribute__((optimize("unroll-loops")))
void grayscale(unsigned char src[N][N][4], unsigned char dst[N][N]){
unsigned char color;
asm("nop");
asm("srl %i0, %o1, %g2");
asm("nop");
for (int i = 0; i<N; i++)
for (int j = 0; j<N; j++){
color = shift_and_add(*((int *) &src[i][j][0]));
dst[i][j][0] = color;
dst[i][j][1] = color;
dst[i][j][2] = color;
dst[i][j]= *((int *) &src[i][j][0]) >> 2; //shift_and_add(*((int *) &src[i][j][0]));
}
asm("nop");
asm("srl %i0, %o1, %g2");
asm("nop");
}
void print(unsigned char src[N][N][3]) {
void print(unsigned char src[N][N]) {
printf("P3\n%d %d\n255\n",N,N);
for (int i = 0; i<N; i++){
for (int j = 0; j<N; j++){
printf("%d %d %d ", src[i][j][0], src[i][j][1], src[i][j][2]);
printf("%d %d %d ", src[i][j], src[i][j], src[i][j]);
}
printf("\n");
}
......@@ -50,7 +60,7 @@ void print(unsigned char src[N][N][3]) {
int main(){
unsigned char source[N][N][4] = {IMAGE_ARRAY};
unsigned char dest[N][N][3];
unsigned char dest[N][N];
//init(source);
grayscale(source, dest);
print(dest);
......
This diff is collapsed.
#define
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment