Skip to content

Commit cd288df

Browse files
authored
Multi lingual (#20)
* support multilingual NMT
* fix typo
Co-authored-by: wangxiaohui <wangxiaohui.neo@bytedance.com>
1 parent 8070745 commit cd288df

15 files changed

+840
-153
lines changed

docs/build.md

-2
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,6 @@
55
- protobuf >= 3.13
66
- cmake >= 3.18
77

8-
There are submodules in this repository which you should clone with `--recurse-submodules`.
9-
108
To install cudatoolkit-dev, you can run `conda install -c conda-forge cudatoolkit-dev` or follow the [official guide](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#runfile); the runfile installation with the `--toolkit` argument is recommended.
119

1210
After installation, check the installation of `nvcc` and static libraries (*.a) in `${CUDA_PATH}/lib64`.

example/transformer_example.cc.cu

+15-9
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,13 @@ Example of how to run transformer inference using our implementation.
1010
*/
1111

1212
// Specify the computation precision.
13-
const lightseq::cuda::OperationType optype =
13+
#ifdef FP16_MODE
14+
const lightseq::cuda::OperationType OPTYPE =
15+
lightseq::cuda::OperationType::FP16;
16+
#else
17+
const lightseq::cuda::OperationType OPTYPE =
1418
lightseq::cuda::OperationType::FP32;
19+
#endif
1520

1621
int main(int argc, char *argv[]) {
1722
/* ---step1. init environment--- */
@@ -21,10 +26,10 @@ int main(int argc, char *argv[]) {
2126
cudaStreamCreate(&stream_);
2227
cublasCreate(&hd_);
2328
cublasSetStream(hd_, stream_);
24-
typedef lightseq::cuda::OperationTypeTraits<optype> optraits;
29+
typedef lightseq::cuda::OperationTypeTraits<OPTYPE> optraits;
2530

2631
/* ---step2. load model weights into GPU memory--- */
27-
lightseq::cuda::TransformerWeight<optype> tw_;
32+
lightseq::cuda::TransformerWeight<OPTYPE> tw_;
2833
// saved in custom proto file
2934
std::string model_weights_path = argv[1];
3035
std::string res = tw_.initializing(model_weights_path);
@@ -47,8 +52,8 @@ int main(int argc, char *argv[]) {
4752
std::vector<int>(max_batch_size * tw_._max_step * tw_._hidden_size, 0);
4853
thrust::device_vector<int> d_output_ =
4954
std::vector<int>(max_batch_size * tw_._max_step, 0);
50-
std::shared_ptr<lightseq::cuda::Encoder<optype>> encoder_ =
51-
std::make_shared<lightseq::cuda::Encoder<optype>>(
55+
std::shared_ptr<lightseq::cuda::Encoder<OPTYPE>> encoder_ =
56+
std::make_shared<lightseq::cuda::Encoder<OPTYPE>>(
5257
max_batch_size,
5358
reinterpret_cast<int *>(thrust::raw_pointer_cast(d_input_.data())),
5459
reinterpret_cast<int *>(
@@ -62,15 +67,16 @@ int main(int argc, char *argv[]) {
6267
return 1;
6368
}
6469
// instantiate decoder
65-
std::shared_ptr<lightseq::cuda::Decoder<optype>> decoder_ =
66-
std::make_shared<lightseq::cuda::Decoder<optype>>(
70+
std::shared_ptr<lightseq::cuda::Decoder<OPTYPE>> decoder_ =
71+
std::make_shared<lightseq::cuda::Decoder<OPTYPE>>(
6772
max_batch_size,
6873
reinterpret_cast<int *>(
6974
thrust::raw_pointer_cast(d_padding_mask_.data())),
7075
reinterpret_cast<optraits::DataType *>(
7176
thrust::raw_pointer_cast(d_encoder_output_.data())),
7277
reinterpret_cast<int *>(thrust::raw_pointer_cast(d_output_.data())),
73-
tw_, stream_, hd_);
78+
tw_, stream_, hd_, false,
79+
reinterpret_cast<int *>(thrust::raw_pointer_cast(d_input_.data())));
7480
res = decoder_->check();
7581
if (!res.empty()) {
7682
std::cout << res << std::endl;
@@ -104,7 +110,7 @@ int main(int argc, char *argv[]) {
104110
batch_seq_len, host_input);
105111

106112
/* ---step5. infer and log--- */
107-
for (int i = 0; i < 10; i++) {
113+
for (int i = 0; i < 1; i++) {
108114
auto start = std::chrono::high_resolution_clock::now();
109115
// copy inputs from cpu memory to gpu memory
110116
cudaMemcpyAsync(

kernels/CMakeLists.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
cmake_minimum_required(VERSION 3.18)
22

3-
set(cuda_kernel_files gptKernels.cc.cu transformerKernels.cc.cu)
3+
set(cuda_kernel_files gptKernels.cc.cu transformerKernels.cc.cu multilgKernels.cc.cu)
44

55
add_library(cuda_kernels STATIC ${cuda_kernel_files})
66
target_include_directories(cuda_kernels INTERFACE ${CMAKE_CURRENT_SOURCE_DIR})

kernels/gptKernels.h

+1
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ void ker_gpt_embedding_launcher(int batch_size, int batch_seq_len,
1717
int pos_offset);
1818

1919

20+
2021
template <typename T>
2122
void ker_correlation_softmax_gpt_launcher(int batch_size, int batch_seq_len,
2223
int head_num, cudaStream_t stream,

0 commit comments

Comments
 (0)