Skip to content

Commit 7f358b9

Browse files
wangchang-2020, Zars19, jackzipu
authored
cherry-pick master 代码到SDK2.2.2 分支上 (#559)
* Add Custom Op for Yolov3 Post Process (#512) * add custom op for yolov3 * reset submodule onnx * reset tensorrt * delete build * merge odla_ops_nn * modify for passing link-check Co-authored-by: gcuser <jackz@graphcore.ai> (cherry picked from commit 5847cd3) * ODLA popART pipeline function (#522) * First runnable with single thread & test context * mnist runnable demot to test the pipeline * multi thread put the data to the session run * simple bash to compile and run test * An example of how to use the callback in pipeline * multi threads using local Ctx * Can run with pipeline setting in onnx file * Refactored and add no pipeline multi thread * Move codes to the odla_pipeline.h .cc * Make single empty/zero data, and delete context for empty data after get result * Add mutex to serialization the compute requests * Merge the changes for attention mask & prevous changes * test codes for time * Chage the CMakeList to make the pipeline.cc and new custom op compiled * Successfully run on 24L with attention mask custom OP * custom op attention_mask test code * And name scope to the each node in model * Try throghput test with MLPerf model * only set AMP on feed forward matmul * Run the online pipeling with config hard coded to the config read class * Compile with SDK 2.2 with pipeline online setting * Add config file for pipeline stage setting * Run pipeline with similar performance of popart * change some names & make AMP all 0.445 * Add amp parameter in config file * Detach device and clear session when DestroyComputation * Make the batch_per_step take effect on execution mode SEQUENCE to pass enough size of data * Add the new lock free queue and logging * Fix bug on empty data visit counter * delete the empty context * add some pipeline sync * Make thread sleep for 5 ms when no task in the queue * change the size() of LockFreeQueue to tail-wait * [CI] make the call by main can work with npz files * Move the computation init to create context * Add common functions 
to common.h and common.cc * move the compuation init out * Move common functions to the test foler * Test the config of ODLA popART and make no configuration act as before * Add tests for call the model.cc * Add FP32 to save as result * Some changes on LockFreeQueue and tests * Fix the rsqrt wrong problem, and remove std cout&cerr to avoid crash * fix the accuracy problem of large bps * Add thread check for context & computation holding to avoid conflicts * Add the batch tools to help on the test to generate model, build and run * Decreasing the empty data put * temporary commit to migrate crashed system * set pipeline information on fly change the mixed style of class member add debug setting and default to false to make the opts set by api remove the old pipeline set api * Fixed the mixed code style and removed redundant codes * Remove the function test codes of the odla_popart * remove some redundant codes and files * Changed the CACHE STRING to CACHE PATH * move ENGINE_CACHE_PATH to odla_popart.cc * format the codes with clang-format-9 -i command * Move json.hpp to third party * Set virtualgraph for model not using pipeline in set_session_opts * Add virtual graph attribute when _odla_computation constructed * Check the shape before extends it with batches_per_step Co-authored-by: gcuser <gcuser@alibaba-inc.com> (cherry picked from commit 6095bdf) * fix on default configuration & computation destroyment (cherry picked from commit 40b9fc8) * definitions for static variables (cherry picked from commit 18e0e83) * disable test case test_constant_popart.cc Co-authored-by: Zars19 <1036473307@qq.com> Co-authored-by: jackzipu <74961298+jackzipu@users.noreply.github.com> Co-authored-by: gcuser <jackz@graphcore.ai>
1 parent 0810ace commit 7f358b9

33 files changed

+28498
-474
lines changed

ODLA/include/ODLA/ops/odla_ops_nn.h

+17
Original file line numberDiff line numberDiff line change
@@ -530,6 +530,23 @@ extern ODLA_API_EXPORT odla_values ODLA_API_CALL odla_TopK(
530530
odla_uint32 axis, odla_value_type output_value_type,
531531
odla_value_type output_value_index_type, const odla_value_ids value_ids);
532532

533+
//! \brief Yolov3 Post Process
534+
/*!
535+
PostProcess Return Selected Info (cx, cy, w, h, pred_cls) of Each Class
536+
537+
\param orig_img_w the width of original image
538+
\param orig_img_h the height of original image
539+
\param bb13 BBoxes 13 x 13
540+
\param bb26 BBoxes 26 x 26
541+
\param bb52 BBoxes 52 x 52
542+
\param value_id a unique value id (can be NULL)
543+
544+
\return odla_values
545+
*/
546+
extern ODLA_API_EXPORT odla_values ODLA_API_CALL odla_PostProcess(
547+
odla_value orig_img_w, odla_value orig_img_h, odla_value bb13,
548+
odla_value bb26, odla_value bb52, const odla_value_id value_id);
549+
533550
#ifdef __cplusplus
534551
} // C extern
535552
#endif

ODLA/platforms/odla_popart/CMakeLists.txt

+8-2
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,9 @@ option(ODLA_BUILD_POPART_CUSTOM_OPS "Link with Popart custom ops" ON)
2020
add_odla_library(odla_popart SHARED common.cc odla_compute.cc
2121
odla_ops_math.cc odla_ops_nn.cc
2222
odla_ops_process.cc odla_ops.cc
23-
odla_ops_custom.cc
24-
)
23+
odla_ops_custom.cc odla_pipeline.cc
24+
odla_popart.cc popart_config.cc
25+
)
2526

2627
if (NOT POPLAR_ROOT)
2728
set(POPLAR_ROOT "/opt/poplar_sdk/poplar" CACHE PATH "Path of poplar root")
@@ -51,3 +52,8 @@ if (NOT ODLA_BUILD_POPART_USE_CXX11ABI)
5152
endif()
5253

5354
target_link_libraries(odla_popart PUBLIC ODLA custom_ops popart-only)
55+
56+
target_include_directories(odla_popart PRIVATE
57+
${CMAKE_CURRENT_SOURCE_DIR}/custom_ops/third_party/onnx/
58+
${CMAKE_CURRENT_SOURCE_DIR}/custom_ops/third_party/include/
59+
)
+21
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
{
2+
"version":"1.0.0",
3+
"amp":0.445,
4+
"batch_per_step":10,
5+
"execution_mode":"pipeline",
6+
"ipu_num":2,
7+
"load_onnx":false,
8+
"load_onnx_path":"path",
9+
"pipeline":{
10+
"^embedding_" : [0, 0],
11+
"^layer[0-9]_" : [0, 0],
12+
"^layer1[0-1]_" : [0, 0],
13+
"^layer1[2-9]_" : [1, 1],
14+
"^layer2[0-3]_" : [1, 1],
15+
"^squad_" : [1, 1]
16+
},
17+
"queue_type":"LockFreeQueue",
18+
"queue_capacity":1048576,
19+
"save_model" : true,
20+
"save_model_path":"pipeline_test.onnx"
21+
}

ODLA/platforms/odla_popart/custom_ops/CMakeLists.txt

+3-1
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@
1717
add_odla_library(custom_ops SHARED
1818
erf.cc
1919
rsqrt.cc
20+
postprocess.cc
21+
attention_mask.cc
2022
)
2123

2224
set_property(TARGET custom_ops PROPERTY CXX_STANDARD 14)
@@ -31,4 +33,4 @@ target_link_libraries(custom_ops PRIVATE popart-only)
3133
target_include_directories(custom_ops PRIVATE
3234
${CMAKE_CURRENT_SOURCE_DIR}/third_party/onnx/
3335
${CMAKE_CURRENT_SOURCE_DIR}/third_party/include/
34-
)
36+
)
Original file line numberDiff line numberDiff line change
@@ -1,27 +1,31 @@
11
CXX ?= g++
2-
CXXFLAGS = -std=c++14 -fPIC -g -DONNX_NAMESPACE=onnx -D_GLIBCXX_USE_CXX11_ABI=0
2+
CXXFLAGS = -std=c++14 -fPIC -g -DONNX_NAMESPACE=onnx
33
LDLIBS = -shared -lpopart -lpoplar -lpopops -lpoputil
4-
INCLUDES = -Iinclude
4+
INCLUDES = -Iinclude -Ithird_party/onnx/ -Ithird_party/include
55

66
BUILD_DIR = build
7-
SOURCES = rsqrt.cc erf.cc
7+
SOURCES = rsqrt.cc erf.cc postprocess.cc attention_mask.cc
88
TARGET = $(BUILD_DIR)/libcustom_ops.so
99

10-
all: create_build_dir rsqrt_custom_op rsqrt_test erf_test
10+
all: create_build_dir rsqrt_custom_op rsqrt_test attention_mask_test
1111

1212
.PHONY: create_build_dir
1313
create_build_dir:
1414
mkdir -p $(BUILD_DIR)
1515

16-
rsqrt_custom_op: rsqrt.cc erf.cc
16+
rsqrt_custom_op: ${SOURCES}
1717
$(CXX) $(SOURCES) $(LDLIBS) $(CXXFLAGS) $(INCLUDES) -o $(TARGET)
1818

1919
rsqrt_test: rsqrt_test.cc rsqrt_custom_op
20-
$(CXX) -std=c++14 rsqrt_test.cc -lpopart -lpoplar -lpopops -ldl -DONNX_NAMESPACE=onnx -o rsqrt_test -D_GLIBCXX_USE_CXX11_ABI=0
20+
$(CXX) -std=c++14 rsqrt_test.cc -lpopart -lpoplar -lpopops -ldl -DONNX_NAMESPACE=onnx -o rsqrt_test
2121

22-
erf_test: erf_test.cc rsqrt_custom_op
23-
$(CXX) -std=c++14 erf_test.cc -lpopart -lpoplar -lpopops -ldl -DONNX_NAMESPACE=onnx -o erf_test -D_GLIBCXX_USE_CXX11_ABI=0
22+
#erf_test: erf_test.cc rsqrt_custom_op
23+
# $(CXX) -std=c++14 erf_test.cc -lpopart -lpoplar -lpopops -ldl -DONNX_NAMESPACE=onnx -o erf_test
24+
25+
attention_mask_test: attention_mask_test.cc rsqrt_custom_op
26+
# $(CXX) $(LDLIBS) $(CXXFLAGS) $(INCLUDES) -o attention_mask_test
27+
$(CXX) -std=c++14 -fPIC -g -DONNX_NAMESPACE=onnx attention_mask_test.cc -lpopart -lpoplar -lpopops -ldl -o attention_mask_test
2428

2529
.PHONY: clean
2630
clean:
27-
rm -r $(BUILD_DIR) rsqrt_test erf_test
31+
rm -r $(BUILD_DIR) rsqrt_test attention_mask_test
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,144 @@
1+
// Copyright (c) 2019 Graphcore Ltd. All rights reserved.
2+
3+
#include <iostream>
4+
#include <popart/names.hpp>
5+
#include <popart/op.hpp>
6+
#include <popart/opmanager.hpp>
7+
#include <popart/popx/devicex.hpp>
8+
#include <popart/popx/opx.hpp>
9+
#include <popart/popx/opxmanager.hpp>
10+
#include <popart/region.hpp>
11+
#include <popart/shapeinference.hpp>
12+
#include <popops/Cast.hpp>
13+
#include <popops/ElementWise.hpp>
14+
#include <popops/Rearrange.hpp>
15+
#include <poputil/TileMapping.hpp>
16+
#include <random>
17+
18+
using namespace popart;
19+
using namespace popart::popx;
20+
using namespace popops::expr;
21+
22+
namespace CustomOperators {
23+
const popart::OperatorIdentifier AttentionMask = {"ai.graphcore",
24+
"AttentionMask", 1};
25+
} // namespace CustomOperators
26+
27+
// An InplaceIdentityOp that doesn't return any grad ops. This allows you to
28+
// disconnect the flow of gradients when creating the backwards pass
29+
class AttentionMaskOp : public popart::Op {
30+
public:
31+
poplar::Type dataType;
32+
33+
AttentionMaskOp(const popart::OperatorIdentifier& _opid,
34+
const Op::Settings& settings_, poplar::Type& dataTypeIn)
35+
: Op(_opid, settings_), dataType(dataTypeIn) {}
36+
37+
void setup() final {
38+
// input shape [B, S]
39+
Shape inShape = inInfo(0).shape();
40+
Shape refShape = inInfo(1).shape();
41+
42+
// output shape [B, 1, S, S]
43+
Shape outShape = {inShape.at(0), 1, inShape.at(1), inShape.at(1)};
44+
45+
if (dataType == poplar::HALF)
46+
outInfo(0) = {"FLOAT16", outShape};
47+
else
48+
outInfo(0) = {"FLOAT", outShape};
49+
}
50+
51+
std::unique_ptr<Op> clone() const final {
52+
return std::make_unique<AttentionMaskOp>(*this);
53+
}
54+
55+
float getSubgraphValue() const final { return getLowSubgraphValue(); }
56+
};
57+
58+
static popart::OpDefinition attentionMaskOpDef({});
59+
60+
static popart::OpCreator<AttentionMaskOp> attentionMaskOpCreator(
61+
popart::OpDefinitions({{CustomOperators::AttentionMask,
62+
attentionMaskOpDef}}),
63+
[](const popart::OpCreatorInfo& oci) -> std::unique_ptr<popart::Op> {
64+
std::string type =
65+
oci.attributes.getAttribute<Attributes::String>("dataType");
66+
poplar::Type dataType = (type == "FLOAT") ? poplar::FLOAT : poplar::HALF;
67+
68+
return std::unique_ptr<AttentionMaskOp>(
69+
new AttentionMaskOp(oci.opid, oci.settings, dataType));
70+
},
71+
true);
72+
73+
class AttentionMaskOpX : public popart::popx::Opx {
74+
public:
75+
AttentionMaskOpX(popart::Op* op, popart::popx::Devicex* devicex)
76+
: popart::popx::Opx(op, devicex) {
77+
verifyOp<AttentionMaskOp>(op, CustomOperators::AttentionMask);
78+
}
79+
80+
popart::popx::InputCreatorType getInputCreatorType(popart::InIndex) const {
81+
return popart::popx::InputCreatorType::CanUnwind;
82+
}
83+
84+
poplar::Tensor unwindTensorLayout(poplar::Tensor tensor, popart::InIndex,
85+
popart::OutIndex) const {
86+
return tensor;
87+
}
88+
89+
popart::view::RegMap unwindRegion(popart::InIndex, popart::OutIndex) const {
90+
return [this](const popart::view::Region& r) {
91+
return popart::view::Regions(1, r);
92+
};
93+
}
94+
95+
void grow(poplar::program::Sequence& prog) const final {
96+
AttentionMaskOp& myOp = getOp<AttentionMaskOp>();
97+
98+
poplar::Type dataType = myOp.dataType;
99+
poplar::Graph& graph = Opx::graph();
100+
// input tensor shape [B, S]
101+
poplar::Tensor seqIndex = getInTensor(0);
102+
std::size_t batchSize = seqIndex.dim(0);
103+
std::size_t seqLength = seqIndex.dim(1);
104+
seqIndex = seqIndex.reshape({batchSize, seqLength, 1});
105+
seqIndex = popops::cast(graph, seqIndex, dataType, prog, "input_mask_f");
106+
poplar::Tensor attentionMatrix = getInTensor(1);
107+
108+
const auto dimOrdering =
109+
poputil::detectDimGroupings(graph, attentionMatrix);
110+
bool swapOrder = !dimOrdering.empty() && dimOrdering.front().first == 2;
111+
auto seqMask =
112+
swapOrder ? popops::sub(graph, seqIndex.dimShuffle({0, 2, 1}), seqIndex,
113+
prog, "maskVal")
114+
.dimShuffle({0, 2, 1})
115+
: popops::sub(graph, seqIndex, seqIndex.dimShuffle({0, 2, 1}),
116+
prog, "maskVal");
117+
popops::absInPlace(graph, seqMask, prog);
118+
popops::tanhInPlace(graph, seqMask, prog);
119+
120+
// Create constant tensor;
121+
std::mt19937 randomEngine;
122+
unsigned totalTile = graph.getTarget().getTilesPerIPU();
123+
std::uniform_int_distribution<> distrib(0, totalTile - 1);
124+
int tileForConst = distrib(randomEngine);
125+
poplar::Tensor minValue = graph.addConstant(dataType, {}, -10000.0);
126+
graph.setTileMapping(minValue, tileForConst);
127+
128+
// Create log mask
129+
popops::mulInPlace(graph, seqMask, minValue, prog);
130+
seqMask = seqMask.reshape({batchSize, 1, seqLength, seqLength});
131+
setOutTensor(0, seqMask);
132+
}
133+
};
134+
135+
static popart::popx::OpxCreator<AttentionMaskOpX> attentionMaskOpxCreator(
136+
CustomOperators::AttentionMask);
137+
138+
static popart::RegisterShapeInferenceFunction AttentionMaskShapeInfer(
139+
CustomOperators::AttentionMask, [](ShapeInferenceContext& ctx) {
140+
auto B = ctx.inInfo(1).shape().at(0);
141+
auto S = ctx.inInfo(1).shape().at(3);
142+
auto dtype = ctx.inInfo(1).data_type();
143+
ctx.outInfo(0) = {dtype, Shape({B, 1, S, S})};
144+
});

0 commit comments

Comments
 (0)