Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix clang format conflicts in unchecked policy branch #1789

Merged
merged 26 commits into from
Feb 18, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
506996a
Refactor some hip policies to avoid duplicated code
MrBurmark Aug 24, 2024
f08fddc
Add an unchecked iteration_mapping
MrBurmark Aug 24, 2024
dafde16
Add hip unchecked policies
MrBurmark Aug 24, 2024
b810592
Add hip launch unchecked implementations
MrBurmark Aug 24, 2024
2a315f4
Add support for hip unchecked in kernel
MrBurmark Aug 25, 2024
a4741c7
Add hip warp unchecked policy
MrBurmark Aug 25, 2024
31f0744
Add unchecked policies to docs
MrBurmark Aug 25, 2024
7cad03b
Add cuda explicit implementation
MrBurmark Aug 26, 2024
30847df
Regularize spacing in hip/cuda
MrBurmark Aug 26, 2024
c0ac18e
Fill out 2d and 3d loop and tile implementations
MrBurmark Aug 26, 2024
fac8dec
Add testing for unchecked policies
MrBurmark Sep 12, 2024
e7a7273
Fix launch nested Tile tests
MrBurmark Sep 12, 2024
c4b466b
simplify launch testing and add waits
MrBurmark Sep 13, 2024
8680495
Rename from unchecked to direct_unchecked
MrBurmark Dec 30, 2024
7bf16ba
rename test files
MrBurmark Dec 30, 2024
3128f2f
Run clang-format on the code to reduce merge conflicts for PR #1778
rhornung67 Jan 31, 2025
ab6bc55
Merge branch 'develop' into task/rhornung/clang-format-conflicts
rhornung67 Feb 4, 2025
47ae5f6
Fix some merge conflicts
MrBurmark Feb 6, 2025
2398308
merge cuda policies
MrBurmark Feb 6, 2025
20e34e6
merge hip policies
MrBurmark Feb 6, 2025
c0059fc
Merge branch 'feature/burmark1/kernelNoWorkNoLaunch' into task/rhornu…
MrBurmark Feb 11, 2025
5069939
fixup direct_unchecked calculateDimensions
MrBurmark Feb 11, 2025
d09d161
fixup KernelDimensionCalculator for direct_unchecked
MrBurmark Feb 11, 2025
d4667a3
Fix launch when no work while keeping old behavior
MrBurmark Feb 11, 2025
da77e6a
Merge branch 'develop' of github.com:LLNL/RAJA into task/rhornung/cla…
MrBurmark Feb 14, 2025
3c71080
fix formatting
MrBurmark Feb 14, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
201 changes: 165 additions & 36 deletions docs/sphinx/user_guide/feature/policies.rst

Large diffs are not rendered by default.

55 changes: 55 additions & 0 deletions include/RAJA/pattern/launch/launch_core.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -690,6 +690,21 @@ RAJA_HOST_DEVICE RAJA_INLINE void loop(CONTEXT const& ctx,
body);
}

RAJA_SUPPRESS_HD_WARN
template<typename POLICY_LIST,
         typename CONTEXT,
         typename SEGMENT,
         typename BODY>
RAJA_HOST_DEVICE RAJA_INLINE void loop_icount(CONTEXT const& ctx,
                                              SEGMENT const& segment0,
                                              SEGMENT const& segment1,
                                              BODY const& body)
{
  // Two-segment loop with index counting: dispatch to the executor
  // selected by the loop policy, forwarding both segments and the body.
  using Impl = LoopICountExecute<loop_policy<POLICY_LIST>, SEGMENT>;
  Impl::exec(ctx, segment0, segment1, body);
}

RAJA_SUPPRESS_HD_WARN
template<typename POLICY_LIST,
typename CONTEXT,
Expand Down Expand Up @@ -796,6 +811,46 @@ RAJA_HOST_DEVICE RAJA_INLINE void tile_tcount(CONTEXT const& ctx,
ctx, tile_size0, tile_size1, segment0, segment1, body);
}

template<typename POLICY_LIST,
         typename CONTEXT,
         typename TILE_T,
         typename SEGMENT,
         typename BODY>
RAJA_HOST_DEVICE RAJA_INLINE void tile(CONTEXT const& ctx,
                                       TILE_T tile_size0,
                                       TILE_T tile_size1,
                                       TILE_T tile_size2,
                                       SEGMENT const& segment0,
                                       SEGMENT const& segment1,
                                       SEGMENT const& segment2,
                                       BODY const& body)
{
  // Three-dimensional tiling: forward the three tile sizes and segments
  // to the executor chosen by the loop policy.
  using Impl = TileExecute<loop_policy<POLICY_LIST>, SEGMENT>;
  Impl::exec(ctx, tile_size0, tile_size1, tile_size2, segment0, segment1,
             segment2, body);
}

template<typename POLICY_LIST,
         typename CONTEXT,
         typename TILE_T,
         typename SEGMENT,
         typename BODY>
RAJA_HOST_DEVICE RAJA_INLINE void tile_tcount(CONTEXT const& ctx,
                                              TILE_T tile_size0,
                                              TILE_T tile_size1,
                                              TILE_T tile_size2,
                                              SEGMENT const& segment0,
                                              SEGMENT const& segment1,
                                              SEGMENT const& segment2,
                                              BODY const& body)
{
  // Three-dimensional tiling with tile counting: forward the three tile
  // sizes and segments to the executor chosen by the loop policy.
  using Impl = TileTCountExecute<loop_policy<POLICY_LIST>, SEGMENT>;
  Impl::exec(ctx, tile_size0, tile_size1, tile_size2, segment0, segment1,
             segment2, body);
}

} // namespace expt

} // namespace RAJA
Expand Down
59 changes: 59 additions & 0 deletions include/RAJA/policy/cuda/kernel/For.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,65 @@ namespace RAJA
namespace internal
{

/*
 * Executor for work sharing inside CudaKernel.
 * Specialization for the DirectUnchecked iteration mapping: the loop index
 * is taken straight from IndexMapper with no bounds check in exec(), so the
 * launch dimensions must exactly cover the segment (see
 * calculateDimensions, which sizes the launch from the segment length).
 * Assigns the loop index to offset ArgumentId.
 * Meets all sync requirements.
 */
template<typename Data,
camp::idx_t ArgumentId,
typename IndexMapper,
kernel_sync_requirement sync,
typename... EnclosedStmts,
typename Types>
struct CudaStatementExecutor<
Data,
statement::For<
ArgumentId,
RAJA::policy::cuda::
cuda_indexer<iteration_mapping::DirectUnchecked, sync, IndexMapper>,
EnclosedStmts...>,
Types>
{

using stmt_list_t = StatementList<EnclosedStmts...>;

// Set the argument type for this loop
using NewTypes = setSegmentTypeFromData<Types, ArgumentId, Data>;

// Executor for the statements nested inside this For
using enclosed_stmts_t =
CudaStatementListExecutor<Data, stmt_list_t, NewTypes>;

using diff_t = segment_diff_type<ArgumentId, Data>;

// Computes the launch dimensions implied by this indexer policy
using DimensionCalculator = RAJA::internal::KernelDimensionCalculator<
RAJA::policy::cuda::
cuda_indexer<iteration_mapping::DirectUnchecked, sync, IndexMapper>>;

static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
{
// Map this thread/block directly to a loop index; no range check is
// performed (unchecked mapping)
const diff_t i = IndexMapper::template index<diff_t>();

// Assign the index to the argument
data.template assign_offset<ArgumentId>(i);

// execute enclosed statements
enclosed_stmts_t::exec(data, thread_active);
}

static inline LaunchDims calculateDimensions(Data const& data)
{
// Size the launch from the full segment length
const diff_t len = segment_length<ArgumentId>(data);

LaunchDims dims = DimensionCalculator::get_dimensions(len);

LaunchDims enclosed_dims = enclosed_stmts_t::calculateDimensions(data);

return combine(dims, enclosed_dims);
}
};

/*
* Executor for work sharing inside CudaKernel.
* Mapping directly from IndexMapper to indices
Expand Down
61 changes: 61 additions & 0 deletions include/RAJA/policy/cuda/kernel/ForICount.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,67 @@ namespace RAJA
namespace internal
{

/*
 * Executor for work sharing inside CudaKernel.
 * Provides a direct unchecked mapping.
 * Assigns the loop index to offset ArgumentId
 * Assigns the loop index to param ParamId
 * Meets all sync requirements
 * Inherits calculateDimensions (and type aliases) from the corresponding
 * statement::For executor; only exec() is overridden to also record the
 * loop index in ParamId.
 */
template<typename Data,
camp::idx_t ArgumentId,
typename ParamId,
typename IndexMapper,
kernel_sync_requirement sync,
typename... EnclosedStmts,
typename Types>
struct CudaStatementExecutor<
Data,
statement::ForICount<
ArgumentId,
ParamId,
RAJA::policy::cuda::
cuda_indexer<iteration_mapping::DirectUnchecked, sync, IndexMapper>,
EnclosedStmts...>,
Types>
: CudaStatementExecutor<
Data,
statement::For<ArgumentId,
RAJA::policy::cuda::cuda_indexer<
iteration_mapping::DirectUnchecked,
sync,
IndexMapper>,
EnclosedStmts...>,
Types>
{

using Base = CudaStatementExecutor<
Data,
statement::For<
ArgumentId,
RAJA::policy::cuda::cuda_indexer<iteration_mapping::DirectUnchecked,
sync,
IndexMapper>,
EnclosedStmts...>,
Types>;

using typename Base::diff_t;
using typename Base::enclosed_stmts_t;

static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
{
// direct unchecked mapping: one index per thread/block, no bounds check
// (NOTE: not a grid-stride loop)
const diff_t i = IndexMapper::template index<diff_t>();

// Assign the index to the argument and param
data.template assign_offset<ArgumentId>(i);
data.template assign_param<ParamId>(i);

// execute enclosed statements
enclosed_stmts_t::exec(data, thread_active);
}
};

/*
* Executor for work sharing inside CudaKernel.
* Provides a direct mapping.
Expand Down
86 changes: 86 additions & 0 deletions include/RAJA/policy/cuda/kernel/Tile.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,92 @@ namespace RAJA
namespace internal
{

/*!
 * A specialized RAJA::kernel cuda_impl executor for statement::Tile
 * using the DirectUnchecked indexer: each thread/block is mapped to one
 * tile with no bounds check.
 * Assigns the tile segment to segment ArgumentId
 * Meets all sync requirements
 */
template<typename Data,
camp::idx_t ArgumentId,
camp::idx_t chunk_size,
typename IndexMapper,
kernel_sync_requirement sync,
typename... EnclosedStmts,
typename Types>
struct CudaStatementExecutor<
Data,
statement::Tile<
ArgumentId,
RAJA::tile_fixed<chunk_size>,
RAJA::policy::cuda::
cuda_indexer<iteration_mapping::DirectUnchecked, sync, IndexMapper>,
EnclosedStmts...>,
Types>
{

using stmt_list_t = StatementList<EnclosedStmts...>;

// Executor for the statements nested inside this Tile
using enclosed_stmts_t = CudaStatementListExecutor<Data, stmt_list_t, Types>;

using diff_t = segment_diff_type<ArgumentId, Data>;

// Computes the launch dimensions implied by this indexer policy
using DimensionCalculator = KernelDimensionCalculator<
RAJA::policy::cuda::
cuda_indexer<iteration_mapping::DirectUnchecked, sync, IndexMapper>>;

static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
{
// Get the segment referenced by this Tile statement
auto& segment = camp::get<ArgumentId>(data.segment_tuple);

using segment_t = camp::decay<decltype(segment)>;

// compute the starting offset of this tile (tile index * chunk_size);
// no bounds check is performed (unchecked mapping)
const diff_t i =
IndexMapper::template index<diff_t>() * static_cast<diff_t>(chunk_size);

// Keep copy of original segment, so we can restore it
segment_t orig_segment = segment;

// Assign our new tiled segment
segment = orig_segment.slice(i, static_cast<diff_t>(chunk_size));

// execute enclosed statements
enclosed_stmts_t::exec(data, thread_active);

// Set range back to original values
segment = orig_segment;
}

static inline LaunchDims calculateDimensions(Data const& data)
{
// Compute how many chunks
const diff_t full_len = segment_length<ArgumentId>(data);
const diff_t len =
RAJA_DIVIDE_CEILING_INT(full_len, static_cast<diff_t>(chunk_size));

LaunchDims dims = DimensionCalculator::get_dimensions(len);

// privatize data, so we can mess with the segments
using data_t = camp::decay<Data>;
data_t private_data = data;

// Get original segment
auto& segment = camp::get<ArgumentId>(private_data.segment_tuple);

// restrict to first tile
segment = segment.slice(0, static_cast<diff_t>(chunk_size));

// NOTE: We do not detect improper uses of direct_unchecked policies under
// tiling. This happens when using a direct unchecked policy on a tiled
// range that is not evenly divisible by chunk_size.
LaunchDims enclosed_dims =
enclosed_stmts_t::calculateDimensions(private_data);

return combine(dims, enclosed_dims);
}
};

/*!
* A specialized RAJA::kernel cuda_impl executor for statement::Tile
* Assigns the tile segment to segment ArgumentId
Expand Down
76 changes: 76 additions & 0 deletions include/RAJA/policy/cuda/kernel/TileTCount.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,82 @@ namespace RAJA
namespace internal
{

/*!
 * A specialized RAJA::kernel cuda_impl executor for statement::TileTCount
 * using the DirectUnchecked indexer: each thread/block is mapped to one
 * tile with no bounds check.
 * Assigns the tile segment to segment ArgumentId
 * Assigns the tile index to param ParamId
 * Meets all sync requirements
 * Inherits calculateDimensions (and type aliases) from the corresponding
 * statement::Tile executor; only exec() is overridden to also record the
 * tile index in ParamId.
 */
template<typename Data,
camp::idx_t ArgumentId,
typename ParamId,
camp::idx_t chunk_size,
typename IndexMapper,
kernel_sync_requirement sync,
typename... EnclosedStmts,
typename Types>
struct CudaStatementExecutor<
Data,
statement::TileTCount<
ArgumentId,
ParamId,
RAJA::tile_fixed<chunk_size>,
RAJA::policy::cuda::
cuda_indexer<iteration_mapping::DirectUnchecked, sync, IndexMapper>,
EnclosedStmts...>,
Types>
: public CudaStatementExecutor<
Data,
statement::Tile<ArgumentId,
RAJA::tile_fixed<chunk_size>,
RAJA::policy::cuda::cuda_indexer<
iteration_mapping::DirectUnchecked,
sync,
IndexMapper>,
EnclosedStmts...>,
Types>
{

using Base = CudaStatementExecutor<
Data,
statement::Tile<
ArgumentId,
RAJA::tile_fixed<chunk_size>,
RAJA::policy::cuda::cuda_indexer<iteration_mapping::DirectUnchecked,
sync,
IndexMapper>,
EnclosedStmts...>,
Types>;

using typename Base::diff_t;
using typename Base::enclosed_stmts_t;

static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
{
// Get the segment referenced by this Tile statement
auto& segment = camp::get<ArgumentId>(data.segment_tuple);

using segment_t = camp::decay<decltype(segment)>;

// compute the tile index t and the tile's starting offset i;
// no bounds check is performed (unchecked mapping)
const diff_t t = IndexMapper::template index<diff_t>();
const diff_t i = t * static_cast<diff_t>(chunk_size);

// Keep copy of original segment, so we can restore it
segment_t orig_segment = segment;

// Assign our new tiled segment, and record the tile index in ParamId
segment = orig_segment.slice(i, static_cast<diff_t>(chunk_size));
data.template assign_param<ParamId>(t);

// execute enclosed statements
enclosed_stmts_t::exec(data, thread_active);

// Set range back to original values
segment = orig_segment;
}
};

/*!
* A specialized RAJA::kernel cuda_impl executor for statement::TileTCount
* Assigns the tile segment to segment ArgumentId
Expand Down
Loading