diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 9f84396f..b40e460b 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -2,17 +2,17 @@ See [CONTRIBUTING.md](CONTRIBUTING.md) for details on how to contribute to this The following people from multiple organizations have contributed to this project: -* (Ventana Micro Systems)[https://www.ventanamicro.com] - * (Arup Chakraborty)[https://github.com/arupc] +* [Ventana Micro Systems]([https://www.ventanamicro.com) + * [Arup Chakraborty](https://github.com/arupc) -* (MIPS)[https://mips.com] - * (Knute Lingaard)[https://github.com/klingaard] - * (Kathlene Magnus)[https://github.com/kathlenemagnus] +* [MIPS](https://mips.com) + * [Knute Lingaard](https://github.com/klingaard) + * [Kathlene Magnus](https://github.com/kathlenemagnus) -* (Condor Computing)[https://condorcomputing.com] - * (Jeff Nye)[https://github.com/jeffnye-gh] +* [Condor Computing](https://condorcomputing.com) + * [Jeff Nye](https://github.com/jeffnye-gh) -* (InCore Semiconductors)[https://incoresemi.com/] - * (Sai Govardhan)[https://github.com/govardhnn] +* [InCore Semiconductors](https://incoresemi.com/) + * [Sai Govardhan](https://github.com/govardhnn) List is incomplete and more contributor names/organizations to be added. diff --git a/arches/isa_json/gen_uarch_rv64v_json.py b/arches/isa_json/gen_uarch_rv64v_json.py index 99a7b1c9..3718a3b8 100755 --- a/arches/isa_json/gen_uarch_rv64v_json.py +++ b/arches/isa_json/gen_uarch_rv64v_json.py @@ -487,31 +487,31 @@ "vid.v" : {"pipe" : "vmask", "uop_gen" : "ELEMENTWISE", "latency" : 1}, # Vector Permutation Instructions: Integer Scalar Move Instructions - "vmv.x.s" : {"pipe" : "v2s", "uop_gen" : "NONE", "latency" : 1}, - "vmv.s.x" : {"pipe" : "vmv", "uop_gen" : "NONE", "latency" : 1}, + "vmv.x.s" : {"pipe" : "v2s", "uop_gen" : "SCALAR_MOVE", "latency" : 1}, + "vmv.s.x" : {"pipe" : "vmv", "uop_gen" : "SCALAR_MOVE", "latency" : 1}, # Vector Permutation Instructions: Floating-Point Scalar Move Instructions - "vfmv.f.s" : {"pipe" : "v2s", "uop_gen" : "NONE", "latency" : 1}, - "vfmv.s.f" : {"pipe" : "vmv", "uop_gen" : "NONE", "latency" : 1}, + "vfmv.f.s" : {"pipe" : "v2s", "uop_gen" : "SCALAR_MOVE", "latency" : 1}, + "vfmv.s.f" : {"pipe" : "vmv", "uop_gen" : "SCALAR_MOVE", "latency" : 1}, # Vector Permutation Instructions: Vector Slide Instructions - "vslideup.vx" : {"pipe" : "vpermute", "uop_gen" : "PERMUTE", "latency" : 6}, - "vslideup.vi" : {"pipe" : "vpermute", "uop_gen" : "PERMUTE", "latency" : 6}, - "vslidedown.vx" : {"pipe" : "vpermute", "uop_gen" : "PERMUTE", "latency" : 6}, - "vslidedown.vi" : {"pipe" : "vpermute", "uop_gen" : "PERMUTE", "latency" : 6}, + "vslideup.vx" : {"pipe" : "vpermute", "uop_gen" : "SLIDEUP", "latency" : 6}, + "vslideup.vi" : {"pipe" : "vpermute", "uop_gen" : "SLIDEUP", "latency" : 6}, + "vslidedown.vx" : {"pipe" : "vpermute", "uop_gen" : "SLIDEDOWN", "latency" : 6}, + "vslidedown.vi" : {"pipe" : "vpermute", "uop_gen" : "SLIDEDOWN", "latency" : 6}, "vslide1up.vx" : {"pipe" : "vint", "uop_gen" : "SLIDE1UP", "latency" : 1}, "vfslide1up.vf" : {"pipe" : "vfloat", "uop_gen" : "SLIDE1UP", "latency" : 1}, "vslide1down.vx" : {"pipe" : "vint", "uop_gen" : "SLIDE1DOWN", "latency" : 1}, "vfslide1down.vf": {"pipe" : "vfloat", "uop_gen" : "SLIDE1DOWN", "latency" : 1}, # Vector Permutation Instructions: Vector Register Gather Instructions - "vrgather.vv" : {"pipe" : "vpermute", "uop_gen" : "PERMUTE", "latency" : 6}, - "vrgatherei16.vv": {"pipe" : "vpermute", "uop_gen" : "PERMUTE", "latency" : 6}, - "vrgather.vx" : 
{"pipe" : "vpermute", "uop_gen" : "PERMUTE", "latency" : 6}, - "vrgather.vi" : {"pipe" : "vpermute", "uop_gen" : "PERMUTE", "latency" : 6}, + "vrgather.vv" : {"pipe" : "vpermute", "uop_gen" : "RGATHER", "latency" : 6}, + "vrgatherei16.vv": {"pipe" : "vpermute", "uop_gen" : "RGATHER", "latency" : 6}, + "vrgather.vx" : {"pipe" : "vpermute", "uop_gen" : "RGATHER", "latency" : 6}, + "vrgather.vi" : {"pipe" : "vpermute", "uop_gen" : "RGATHER", "latency" : 6}, # Vector Permutation Instructions: Vector Compress Instruction - "vcompress.vm" : {"pipe" : "vpermute", "uop_gen" : "PERMUTE", "latency" : 6}, + "vcompress.vm" : {"pipe" : "vpermute", "uop_gen" : "COMPRESS", "latency" : 6}, # Vector Permutation Instructions: Whole Vector Register Move "vmv1r.v" : {"pipe" : "vmv", "uop_gen" : "ELEMENTWISE", "latency" : 1}, diff --git a/arches/isa_json/olympia_uarch_rv64v.json b/arches/isa_json/olympia_uarch_rv64v.json index 23dda4c2..441f0747 100644 --- a/arches/isa_json/olympia_uarch_rv64v.json +++ b/arches/isa_json/olympia_uarch_rv64v.json @@ -104,8 +104,8 @@ { "mnemonic": "vcompress.vm", "pipe": "vpermute", - "uop_gen": "PERMUTE", - "latency": 4 + "uop_gen": "COMPRESS", + "latency": 6 }, { "mnemonic": "vdiv.vv", @@ -302,13 +302,13 @@ { "mnemonic": "vfmv.f.s", "pipe": "v2s", - "uop_gen": "NONE", + "uop_gen": "SCALAR_MOVE", "latency": 1 }, { "mnemonic": "vfmv.s.f", "pipe": "vmv", - "uop_gen": "NONE", + "uop_gen": "SCALAR_MOVE", "latency": 1 }, { @@ -1328,7 +1328,7 @@ { "mnemonic": "vmv.s.x", "pipe": "vmv", - "uop_gen": "NONE", + "uop_gen": "SCALAR_MOVE", "latency": 1 }, { @@ -1352,7 +1352,7 @@ { "mnemonic": "vmv.x.s", "pipe": "v2s", - "uop_gen": "NONE", + "uop_gen": "SCALAR_MOVE", "latency": 1 }, { @@ -1586,26 +1586,26 @@ { "mnemonic": "vrgather.vi", "pipe": "vpermute", - "uop_gen": "PERMUTE", - "latency": 4 + "uop_gen": "RGATHER", + "latency": 6 }, { "mnemonic": "vrgather.vv", "pipe": "vpermute", - "uop_gen": "PERMUTE", - "latency": 4 + "uop_gen": "RGATHER", + "latency": 6 }, { "mnemonic": "vrgather.vx", "pipe": "vpermute", - "uop_gen": "PERMUTE", - "latency": 4 + "uop_gen": "RGATHER", + "latency": 6 }, { "mnemonic": "vrgatherei16.vv", "pipe": "vpermute", - "uop_gen": "PERMUTE", - "latency": 4 + "uop_gen": "RGATHER", + "latency": 6 }, { "mnemonic": "vrsub.vi", @@ -1766,26 +1766,26 @@ { "mnemonic": "vslidedown.vi", "pipe": "vpermute", - "uop_gen": "PERMUTE", - "latency": 4 + "uop_gen": "SLIDEDOWN", + "latency": 6 }, { "mnemonic": "vslidedown.vx", "pipe": "vpermute", - "uop_gen": "PERMUTE", - "latency": 4 + "uop_gen": "SLIDEDOWN", + "latency": 6 }, { "mnemonic": "vslideup.vi", "pipe": "vpermute", - "uop_gen": "PERMUTE", - "latency": 4 + "uop_gen": "SLIDEUP", + "latency": 6 }, { "mnemonic": "vslideup.vx", "pipe": "vpermute", - "uop_gen": "PERMUTE", - "latency": 4 + "uop_gen": "SLIDEUP", + "latency": 6 }, { "mnemonic": "vsll.vi", diff --git a/core/InstArchInfo.cpp b/core/InstArchInfo.cpp index 0f618984..3600a06a 100644 --- a/core/InstArchInfo.cpp +++ b/core/InstArchInfo.cpp @@ -77,8 +77,12 @@ namespace olympia {"INT_EXT", InstArchInfo::UopGenType::INT_EXT}, {"SLIDE1UP", InstArchInfo::UopGenType::SLIDE1UP}, {"SLIDE1DOWN", InstArchInfo::UopGenType::SLIDE1DOWN}, - {"PERMUTE", InstArchInfo::UopGenType::PERMUTE}, - {"NONE", InstArchInfo::UopGenType::NONE}}; + {"SCALAR_MOVE", InstArchInfo::UopGenType::SCALAR_MOVE}, + {"RGATHER", InstArchInfo::UopGenType::RGATHER}, + {"COMPRESS", InstArchInfo::UopGenType::COMPRESS}, + {"WHOLE_REG_MOVE", InstArchInfo::UopGenType::WHOLE_REG_MOVE}, + {"NONE", 
InstArchInfo::UopGenType::NONE}, + }; void InstArchInfo::update(const nlohmann::json & jobj) { diff --git a/core/InstArchInfo.hpp b/core/InstArchInfo.hpp index e1ab1e15..8c99e0ac 100644 --- a/core/InstArchInfo.hpp +++ b/core/InstArchInfo.hpp @@ -90,9 +90,14 @@ namespace olympia REDUCTION, REDUCTION_WIDE, INT_EXT, + SLIDEUP, + SLIDEDOWN, SLIDE1UP, SLIDE1DOWN, - PERMUTE, + SCALAR_MOVE, + RGATHER, + COMPRESS, + WHOLE_REG_MOVE, NONE, UNKNOWN }; diff --git a/core/vector/VectorUopGenerator.cpp b/core/vector/VectorUopGenerator.cpp index 153421b6..cca25a65 100644 --- a/core/vector/VectorUopGenerator.cpp +++ b/core/vector/VectorUopGenerator.cpp @@ -169,8 +169,15 @@ namespace olympia // Exe Uop 2: vrgather.vv v21, v9 // Exe Uop 3: vrgather.vv v22, v10 // Exe Uop 4: vrgather.vv v23, v11 - uop_gen_function_map_.emplace(InstArchInfo::UopGenType::PERMUTE, - &VectorUopGenerator::generatePermuteUops_); + // uop_gen_function_map_.emplace(InstArchInfo::UopGenType::RGATHER, + // &VectorUopGenerator::generatePermuteUops_); + + // Vector scalar move uop generator + // Integer Scalar Move + // Floating-Point Scalar Move + uop_gen_function_map_.emplace( + InstArchInfo::UopGenType::SCALAR_MOVE, + &VectorUopGenerator::generateScalarMoveUops_); } void VectorUopGenerator::onBindTreeLate_() { mavis_facade_ = getMavis(getContainer()); } @@ -359,7 +366,7 @@ namespace olympia } } - // For narrowing insturction, + // For narrowing instruction, if constexpr (Type == InstArchInfo::UopGenType::NARROWING) { sparta_assert(src_rs3.field_id != mavis::InstMetaData::OperandFieldID::NONE, @@ -479,9 +486,10 @@ namespace olympia return makeInst_(srcs, dests); } - InstPtr VectorUopGenerator::generatePermuteUops_() + template + InstPtr VectorUopGenerator::generateScalarMoveUops_() { - sparta_assert(false, "Vector permute uop generation is currently not supported!"); + sparta_assert(false, "Vector Scalar move implementation TODO ..."); } InstPtr VectorUopGenerator::makeInst_(const mavis::OperandInfo::ElementList & srcs, diff --git a/core/vector/VectorUopGenerator.hpp b/core/vector/VectorUopGenerator.hpp index 0af3b9bb..30de93fa 100644 --- a/core/vector/VectorUopGenerator.hpp +++ b/core/vector/VectorUopGenerator.hpp @@ -89,7 +89,9 @@ namespace olympia template InstPtr generateSlideUops_(); - InstPtr generatePermuteUops_(); +// InstPtr generatePermuteUops_(); + + template InstPtr generateScalarMoveUops_(); InstPtr makeInst_(const mavis::OperandInfo::ElementList & srcs, const mavis::OperandInfo::ElementList & dests); diff --git a/docs/vector_permutation.adoc b/docs/vector_permutation.adoc new file mode 100644 index 00000000..ec9a3251 --- /dev/null +++ b/docs/vector_permutation.adoc @@ -0,0 +1,565 @@ +:doctitle: Olympia Vector Permutation Design Document + +:toc: + +[[Document_Information]] +== Document Information + +TODO + +[[Revision_History]] +=== Revision History + +[width="100%",cols="11%,11%,16%,62%",options="header",] +|=== +|*Revision* |*Date* |*Author* |*Summary of Changes* +|0.1 | 2025.04.TODO | Sai Govardhan | Initial Vector Permutations +Design Document +|=== + +[[Conventions_and_Terminology]] +=== Conventions and Terminology + + +[width="100%",cols="17%,83%",options="header",] +|=== +|Label |Description +| VLSU | Vector Load Store Unit +| VLEN | Vector Register Length (1024 bits in Olympia) +| SEW | Selected Element Width +| LMUL | Vector Register Group Multiplier +| ELEN | Maximum Vector Element Width +| VTA | Vector Tail Agnostic +// TODO MORE +|=== +[[Related_Documents]] +=== Related Documents + +// + 
+[width="100%",cols="25%,75%",options="header",] +|=== +|*Title* |*Description* +| The RISC-V Vector ISA (v1.0) | TODO +// | Saturn Vectors | TODO +// | The vector thesis | TODO +// Cray reference? +// Chipsalliance T1? +// Tenstorrent Ocelot? +// Barcaelona Supercomputing Group slides? +|=== + +[[Notes_Open_Issues]] +=== Notes/Open Issues + +// + +* Note1 +* Note 2 + +[[OVERVIEW]] +== OVERVIEW +The following is the directory structure of olympia, for reference: + +```bash +. +├── arches +├── CMakeLists.txt +├── CodingStyle.md +├── conda +├── CONTRIBUTING.md +├── CONTRIBUTORS.md +├── core ## Consists of the vector/ directory +├── docs +├── fsl +├── layouts +├── LICENSE +├── mavis +├── mss +├── README.md +├── release +├── reports +├── sim +├── stf_lib +├── test +├── test.json +└── traces +``` + +We shall implement the Vector Permutation instructions in the `core/vector/`, make some modifications to the `core/InstGenerator.cpp` and run tests in the `test/core/vector/` directory. The following document lists down the instructions we have implemented, the changes we have made to the existing files and the architecture of these new vector perumute implementations. + +=== Configuring the Vector Unit + +Olympia implements the Vector Unit in the `core/vector/` directory where: + + - `VLEN` is the width of the vector register statically set to 1024 + + - `ELEN`, the Maximum Vector Element Width is specified based on `sew_` + (Selected Element Width) + +Within the `core/vector/VectorConfig.hpp` file, the `VectorConfig` class is +defined to configure the Vector Unit. + +``` +VectorConfig(uint32_t vl, uint32_t sew, uint32_t lmul, uint32_t vta) +``` + +A sample assembly instruction is: + +``` +vsetvli t0, a0, e32, m1 # Configure vector unit where a0 specifies the vector +length (vl_), sew_=32, lmul_=1 + +``` + +The `vlmax_`, the maximum vector length is set to `((VLEN / sew_) * lmul_)`. + +We would be using a subset of `vlmax_` by specifying the `vl_` in the vector +configuration. + +Take an example where VLEN is set to 1024, `sew_` is 32 bits and `lmul_` is 1. +Then `vlmax_` is ((1024/32)*1) = 32. Which means that there is one logical +Vector register is divided into 32 elements of 32 bits each. + +If we set Vector Length (that we would use) `vl_` to 16, then we are using 16 +elements of 32 `vlmax_` elements we could use in the logical vector register +file instance. + +Note that the `vta_` (Vector Tail Agnostic) parameter is set to false by +default, which indicates that it is undisturbed. When set to true, we are agnostic of the tail elements - and set it to 0s. + + +// + +=== How are the Vector Uops generated? + +We decode and determine the instructions as Vector instructions in the +`core/decode/Decode.cpp` file. + +```cpp +vector_enabled_(true), + vector_config_(new VectorConfig(p->init_vl, p->init_sew, p->init_lmul, p->init_vta)), +``` + +We feed Mavis with the Vector Permutation instructions in json format as specified in the +`mavis/json/isa_rv64v.json` and the `mavis/json/isa_rv64vf.json` files for both +the Base Vector instructions and the Vector Floating Point instructions. + +The `core/vector/VectorUopGenerator.hpp` file implements the Vector Uop +Generator. + + +### Adding Support to Vector Permutation instructions + +- Instruction Architecture Info: + + . `core/InstArchInfo.{hpp}/{cpp}`: + .. Already has `VPERMUTE` in TargetPipe enum + .. Need to ensure proper UopGenType for permutation, to add: + ... `SCALAR_MOVE` + ... `SLIDE1UP` + ... `SLIDE1DOWN` + ... `SLIDEUP` + ... 
+  ... `SLIDEDOWN`
+  ... `RGATHER`
+  ... `COMPRESS`
+  ... `WHOLE_REG_MOVE`
+
+  . `mavis/json/isa_rv64v.json`:
+  .. Define vector permutation instruction encodings
+  .. Specify operand types and fields
+
+  . `core/execute/IssueQueue.hpp`:
+  .. Configure the scheduler for vector permute operations
+
+  . `core/execute/Execute.cpp`:
+  .. Handle execution of permute operations
+
+  . `core/vector/VectorConfig.hpp`:
+  .. Already has the basic vector config (VLEN, SEW, LMUL)
+  .. May need updates for permute-specific settings
+
+The files we shall be modifying:
+
+. `core/InstArchInfo.hpp`
+  - UopGenType to be updated with the specific implementations of the Vector Permutation instructions, and the `PERMUTE` entry to be removed
+
+. `core/vector/VectorUopGenerator.hpp`
+  - Currently has a stub for `generatePermuteUops_`
+
+. `core/vector/VectorUopGenerator.cpp`
+  - Add implementations for the specific permutation ops and replace `generatePermuteUops_`
+
+```cpp
+    uop_gen_function_map_.emplace(InstArchInfo::UopGenType::PERMUTE,
+                                  &VectorUopGenerator::generatePermuteUops_);
+```
+
+. `test/core/vector/Vector_test.cpp`:
+  - Add test cases for the vector permutation instructions
+
+
+==== List of all the Vector Permutation Instructions to be Implemented
+
+===== Vector Scalar Move Instructions
+
+Integer Scalar Move
+
+ . vmv.x.s rd, vs2 # x[rd] = vs2[0]
+ . vmv.s.x vd, rs1 # vd[0] = x[rs1]
+
+Floating-Point Scalar Move
+
+ . vfmv.f.s rd, vs2 # f[rd] = vs2[0] (rs1=0)
+ . vfmv.s.f vd, rs1 # vd[0] = f[rs1] (vs2=0)
+
+
+Key points:
+
+ - Ignores LMUL and vector register groups
+ - Operates even if vstart ≥ vl or vl=0
+ - Handles SEW vs XLEN width differences
+
+Micro-ops to be generated:
+
+ - Since these instructions ignore LMUL and vector register groups, we generate a single micro-op per instruction: `SCALAR_MOVE`.
+ // - TODO REVIEW SIMPLE IMPLEMENTATION: In a simple implementation, we can permute one element per cycle in a single-cycle pipelined manner, by iterating through all the elements of the source register, checking vstart and vl, and maintaining the tail-agnostic policy.
+
+Other handling:
+
+ - If vstart is greater than or equal to vl, the micro-op is not generated and the instruction is treated as a no-op.
+ - If vl = 0, the micro-op is generated but the destination register is not updated.
+ - The tail elements of the destination register are set to 0 if vta is set to true (tail agnostic), else they are left undisturbed.
+
+The pseudo code for the execution of the `SCALAR_MOVE` micro-op:
+
+If we are updating a scalar destination register from the vector source register:
+
+```
+x_dest[rd] = v_src[0];
+```
+
+If we are updating a vector destination register from the scalar source register:
+
+```
+v_dest[0] = x_src[rs1];
+```
+
+TODO DOUBT: How do we enforce the rest of the v_dest elements to be set to 0 if vta is true?
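+
+One possible answer to the above, per the RISC-V V 1.0 specification, is that tail-agnostic destination elements may either be left undisturbed or overwritten with all 1s, so a model that writes 0s is a simplification. The sketch below is a minimal, standalone reference for the `SCALAR_MOVE` semantics; it is not Olympia code — the `Element` container and the `vstart`/`vl`/`vta` parameters are stand-ins for the real `VectorConfig` state — and it models the tail-agnostic case by writing all 1s.
+
+```cpp
+#include <cstddef>
+#include <cstdint>
+#include <vector>
+
+using Element = uint64_t; // stand-in for one SEW-wide element
+
+// vmv.s.x / vfmv.s.f direction: write the scalar into element 0 of vd, then
+// apply the tail policy to elements 1 .. (VLEN/SEW - 1).
+void scalarMoveToVector(std::vector<Element> & vd, const Element scalar,
+                        const uint32_t vstart, const uint32_t vl, const bool vta)
+{
+    if (vstart >= vl || vl == 0)
+    {
+        return; // no destination elements are updated
+    }
+    vd[0] = scalar; // element 0 is the only body element
+    if (vta)
+    {
+        // Tail-agnostic: the spec allows leaving the tail undisturbed or
+        // overwriting it with all 1s; this sketch overwrites with all 1s.
+        for (std::size_t i = 1; i < vd.size(); ++i)
+        {
+            vd[i] = ~Element{0};
+        }
+    }
+    // vta == false: tail elements are left undisturbed.
+}
+
+// vmv.x.s / vfmv.f.s direction: element 0 is read even if vstart >= vl or vl == 0.
+Element scalarMoveFromVector(const std::vector<Element> & vs2) { return vs2[0]; }
+```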
+
+===== Vector Slide Instructions
+
+ . vslideup.vx vd, vs2, rs1, vm # vd[i+rs1] = vs2[i]
+ . vslideup.vi vd, vs2, uimm, vm # vd[i+uimm] = vs2[i]
+
+ . vslidedown.vx vd, vs2, rs1, vm # vd[i] = vs2[i+rs1]
+ . vslidedown.vi vd, vs2, uimm, vm # vd[i] = vs2[i+uimm]
+
+ . vslide1up.vx vd, vs2, rs1, vm # vd[0]=x[rs1], vd[i+1]=vs2[i]
+ . vfslide1up.vf vd, vs2, rs1, vm # vd[0]=f[rs1], vd[i+1]=vs2[i]
+
+ . vslide1down.vx vd, vs2, rs1, vm # vd[i]=vs2[i+1], vd[vl-1]=x[rs1]
+ . vfslide1down.vf vd, vs2, rs1, vm # vd[i]=vs2[i+1], vd[vl-1]=f[rs1]
+
+Critical behaviors:
+
+ - No operation if vstart ≥ vl
+ - Follows tail/mask policies
+ - Source/dest register groups cannot overlap
+ - OFFSET comes from an x-register or an immediate
+
+Micro-ops to be generated:
+
+ - For the vector `SLIDEUP` micro-op, we shall iterate over each element of the source register and update the destination register based on the offset and mask. Note that the lower elements (below the offset) of the destination register remain unchanged.
+ The computation would be as follows in pseudo code:
+
+ ```
+ // offset is either the register value (x[rs1]) or the immediate value (uimm)
+ for (int i = 0; i + offset < vl; i++) {
+   if (mask[i + offset]) {
+     dest[i + offset] = src[i];
+   }
+ }
+ ```
+
+ - For the vector `SLIDEDOWN` micro-op, we shall iterate over each element of the destination register and read the source element at `i + offset`, based on the mask. Source elements past the end of the register group read as zero.
+ The computation would be as follows in pseudo code:
+
+ ```
+ for (int i = 0; i < vl; i++) {
+   if (mask[i]) {
+     dest[i] = (i + offset < vlmax) ? src[i + offset] : 0;
+   }
+ }
+ ```
+
+ - For the vector `SLIDE1UP` micro-op, we shall update the destination register based on the offset and mask. Note that we can reuse the `SLIDEUP` micro-op for this instruction by setting the offset to 1:
+
+ ```
+ if (mask[0]) {
+   dest[0] = rs1; // x[rs1], or f[rs1] for vfslide1up.vf
+ }
+ for (int i = 0; i + 1 < vl; i++) {
+   if (mask[i + 1]) {
+     dest[i + 1] = src[i];
+   }
+ }
+ ```
+
+ - For the vector `SLIDE1DOWN` micro-op, we shall update the destination register based on the offset and mask. Note that we can reuse the `SLIDEDOWN` micro-op for this instruction by setting the offset to 1:
+
+ ```
+ for (int i = 0; i + 1 < vl; i++) {
+   if (mask[i]) {
+     dest[i] = src[i + 1];
+   }
+ }
+ // The last body element of the destination register is filled with the register value
+ if (mask[vl - 1]) {
+   dest[vl - 1] = rs1; // x[rs1], or f[rs1] for vfslide1down.vf
+ }
+ ```
+
+===== Vector Register Gather
+
+ . vrgather.vv vd, vs2, vs1, vm # vd[i] = (vs1[i] >= VLMAX) ? 0 : vs2[vs1[i]]
+ . vrgatherei16.vv vd, vs2, vs1, vm # vd[i] = (vs1[i] >= VLMAX) ? 0 : vs2[vs1[i]]
+ . vrgather.vx vd, vs2, rs1, vm # vd[i] = (x[rs1] >= VLMAX) ? 0 : vs2[x[rs1]]
+ . vrgather.vi vd, vs2, uimm, vm # vd[i] = (uimm >= VLMAX) ? 0 : vs2[uimm]
+
+Requirements:
+
+ - Out-of-range indices return 0
+ - No source/dest overlap allowed
+ - Handles different element widths
+
+Micro-ops to be generated: `RGATHER`
+
+- For the vector-vector gather instructions (`vrgather.vv`, `vrgatherei16.vv`), we iterate over each destination element and read the source element selected by the corresponding element of the index vector vs1, based on the mask. If the index is out of range, we set the destination element to 0. Following is the pseudo code for the same:
+
+ ```
+ for (int i = 0; i < vl; i++) {
+   if (mask[i]) {
+     // For the .vv forms the index comes from the index vector vs1
+     int index = vs1[i];
+     if (index >= VLMAX) {
+       dest[i] = 0;
+     } else {
+       dest[i] = src[index];
+     }
+   }
+ }
+ ```
+
+- For the `.vx` and `.vi` forms, every destination element reads the same source element, whose index is specified by the scalar register or the immediate value, and we update the destination register based on this index and mask. If the index is out of range, we set the destination element to 0. Following is the pseudo code for the same:
+
+ ```
+ // index is either the register value (x[rs1]) or the immediate value (uimm)
+ int index = is_vx_form ? x_rs1 : uimm;
+ for (int i = 0; i < vl; i++) {
+   if (mask[i]) {
+     if (index >= VLMAX) {
+       dest[i] = 0;
+     } else {
+       dest[i] = src[index];
+     }
+   }
+ }
+ ```
+
+===== Vector Compress
+
+ . vcompress.vm vd, vs2, vs1 # Pack masked elements contiguously
+
+Note that vs1 acts as the vector mask register: the elements of vs2 whose mask bit is set to 1 are packed contiguously into vd.
+
+Micro-ops to be generated: `COMPRESS`
+
+The micro-op for this instruction shall iterate over each element of the source register and update the destination register based on the mask. In this implementation, we shall use a register to store the next available index in the destination register (a free pointer).
+The computation would be as follows in pseudo code:
+
+```
+int next_index = 0;
+for (int i = 0; i < vl; i++) {
+  if (mask[i]) {
+    dest[next_index] = src[i];
+    next_index++;
+  }
+}
+```
+
+
+===== Whole Vector Register Move
+
+ . vmv1r.v v1, v2 # Copy v1=v2
+ . vmv2r.v v10, v12 # Copy v10=v12; v11=v13
+ . vmv4r.v v4, v8 # Copy v4=v8; v5=v9; v6=v10; v7=v11
+ . vmv8r.v v0, v8 # Copy v0=v8; v1=v9; ...; v7=v15
+
+Micro-ops to be generated: `WHOLE_REG_MOVE`
+
+The micro-op for this instruction shall move the complete source register group to the destination register group, with the number of registers (1, 2, 4, or 8) specified by the instruction.
+We shall decode the instruction, and specify the starting register index and the number of registers to be moved in the micro-op.
+
+Pseudo code for the micro-op:
+
+```
+// vd and vs2 are the base register indices, num_registers is 1, 2, 4 or 8
+for (int r = 0; r < num_registers; r++) {
+  vreg[vd + r] = vreg[vs2 + r];
+}
+```
+
+=== Overview Block Diagram
+
+==== Block Diagram of the `SCALAR_MOVE` micro-op
+==== Block Diagram of the `SLIDE1UP` micro-op
+==== Block Diagram of the `SLIDE1DOWN` micro-op
+==== Block Diagram of the `SLIDEUP` micro-op
+==== Block Diagram of the `SLIDEDOWN` micro-op
+==== Block Diagram of the `RGATHER` micro-op
+==== Block Diagram of the `COMPRESS` micro-op
+==== Block Diagram of the `WHOLE_REG_MOVE` micro-op
+
+
+[[Functional_Description]]
+== Functional Description
+
+//
+. TODO
+
+=== Taking an example of implementing the vector move instructions
+
+. vmv.x.s rd, vs2 # x[rd] = vs2[0]
+
+.. We add the instruction to the `mavis/json/isa_rv64v.json` file
+
+.. Add the new uop generator type to the `core/InstArchInfo.hpp` file
+
+```cpp
+    enum class UopGenType
+    {
+        ...
+        SCALAR_MOVE,
+        ...
+    };
+```
+
+.. Add a new function for SCALAR_MOVE and declare it in the header (`core/vector/VectorUopGenerator.hpp`)
+
+```cpp
+    template <InstArchInfo::UopGenType Type> InstPtr generateScalarMoveUops_();
+```
+
+.. Add the new function in the `core/vector/VectorUopGenerator.cpp` file
+
+```cpp
+    template <InstArchInfo::UopGenType Type>
+    InstPtr VectorUopGenerator::generateScalarMoveUops_()
+    {
+        // TODO: generate the single SCALAR_MOVE uop here
+    }
+```
+
+.. Add the tests to the `test/core/vector/Vector_test.cpp` file
+
+[[Unit_Block_Diagram]]
+=== Unit Block Diagram
+
+//
+// image:media/image1.png[image,width=576,height=366]
+// Figure 1 - Sample Figure
+1. Vector Scalar Move Instruction
+
+
+[[Block_Diagram_Description]]
+=== Block Diagram Description
+
+
+//
+
+// [[Description_of_Block_B1]]
+// == Description of Block
+
+//
+. TODO
+
+[[Operation]]
+=== Operation
+
+//
+
+1. Vector Scalar Move Instruction
+
+. `vmv.x.s rd, vs2 # x[rd] = vs2[0] (vs1=0)`
+- Performs its operation even if vstart ≥ vl or vl=0.
+- If SEW > XLEN, the least-significant XLEN bits are transferred and the upper SEW-XLEN bits are ignored.
+- If SEW < XLEN, the value is sign-extended to XLEN bits.
+
+[[Interfaces]]
+=== Interfaces
+
+//
+. TODO
+
+[width="100%",cols="18%,21%,61%",options="header",]
+|===
+|*Name* |*C++ Type* |*Purpose/Description*
+| | |
+| | |
+| | |
+|===
+
+[[CPP_Class_Description]]
+=== C++ Class Description
+
+//
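+
+This section is TODO. As a starting point, the sketch below is a small, self-contained reference model — not the Olympia `VectorUopGenerator` class — that mirrors the pseudo code above for the gather and compress operations. Names such as `VectorPermuteReference`, `vlmax` and `mask` are illustrative only, and masked-off and tail destination elements simply read 0 here rather than modelling the undisturbed/agnostic policies. The unit tests in `test/core/vector/Vector_test.cpp` could compare generated uop results against such a model.
+
+```cpp
+#include <cstddef>
+#include <cstdint>
+#include <vector>
+
+// Illustrative reference model only; the element width (SEW) is fixed to 64 bits
+// here for simplicity.
+class VectorPermuteReference
+{
+  public:
+    using Element = uint64_t;
+    using Vec = std::vector<Element>;
+
+    // vrgather.vv / vrgatherei16.vv: vd[i] = (vs1[i] >= vlmax) ? 0 : vs2[vs1[i]]
+    static Vec gather(const Vec & vs2, const Vec & vs1, const std::vector<bool> & mask,
+                      const std::size_t vl, const std::size_t vlmax)
+    {
+        Vec dest(vlmax, 0);
+        for (std::size_t i = 0; i < vl; ++i)
+        {
+            if (mask[i])
+            {
+                dest[i] = (vs1[i] >= vlmax) ? 0 : vs2[vs1[i]];
+            }
+        }
+        return dest;
+    }
+
+    // vcompress.vm: pack the elements of vs2 whose mask bit is set contiguously into vd
+    static Vec compress(const Vec & vs2, const std::vector<bool> & mask,
+                        const std::size_t vl, const std::size_t vlmax)
+    {
+        Vec dest(vlmax, 0);
+        std::size_t next_index = 0;
+        for (std::size_t i = 0; i < vl; ++i)
+        {
+            if (mask[i])
+            {
+                dest[next_index++] = vs2[i];
+            }
+        }
+        return dest;
+    }
+};
+```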