1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
use crate::constants;
use crate::error::{ErrorKind, Result};
use numa_gpu::runtime::memory::DerefMem;
use rustacuda::memory::DeviceCopy;
use std::mem;
/// Computes the canonical number of elements that each input chunk holds.
///
/// `data_len` is divided by `num_chunks` (rounding up), and the quotient is
/// then masked with the number of `Key` elements that fit into
/// `constants::ALIGN_BYTES`, which rounds the chunk length *down*.
///
/// NOTE(review): the bit mask only rounds to a multiple of the alignment if
/// `ALIGN_BYTES / size_of::<Key>()` is a power of two -- confirm for all
/// `Key` types in use.
///
/// # Errors
///
/// Returns `ErrorKind::IntegerOverflow` if the resulting chunk length is
/// greater than or equal to `u32::MAX`.
///
/// # Panics
///
/// Panics with a division by zero if `num_chunks` is zero or if `Key` is a
/// zero-sized type. The mask computation underflows if `Key` is larger than
/// `constants::ALIGN_BYTES`.
pub(super) fn input_chunk_size<Key>(data_len: usize, num_chunks: u32) -> Result<usize> {
    let chunk_count = num_chunks as usize;

    // How many keys fit into one alignment unit, turned into a bit mask that
    // clears the low-order bits of a length.
    let keys_per_alignment = constants::ALIGN_BYTES as usize / mem::size_of::<Key>();
    let align_mask = !(keys_per_alignment - 1);

    // Ceiling division first, then round down to the alignment boundary.
    let unaligned_len = (data_len + chunk_count - 1) / chunk_count;
    let aligned_len = unaligned_len & align_mask;

    if aligned_len < std::u32::MAX as usize {
        return Ok(aligned_len);
    }

    let msg = "Relation is too large and causes an integer overflow. Try using more chunks by setting a higher CUDA grid size";
    Err(ErrorKind::IntegerOverflow(msg.to_string()).into())
}
/// Types whose contents can be split into chunks that serve as radix
/// partitioning input.
pub trait RadixPartitionInputChunkable {
    /// Element type of the produced chunks.
    type Out;

    /// Splits `self` into `num_chunks` chunks that borrow from `self`.
    ///
    /// The `Key` type parameter is not part of the returned value; the
    /// implementations in this file pass it on to the chunk length
    /// computation, where its size determines the alignment.
    ///
    /// # Errors
    ///
    /// Implementations return an error if the chunk length cannot be
    /// computed.
    fn input_chunks<Key>(
        &self,
        num_chunks: u32,
    ) -> Result<Vec<RadixPartitionInputChunk<'_, Self::Out>>>;
}
/// One chunk of a larger input, together with the metadata that describes
/// where the chunk sits within the whole input.
#[derive(Clone, Debug)]
pub struct RadixPartitionInputChunk<'a, T> {
    /// The elements belonging to this chunk, borrowed from the input.
    pub data: &'a [T],
    /// The chunk length shared by all chunks; the actual length of `data`
    /// can differ from it (e.g., the last chunk takes the remainder).
    pub canonical_chunk_len: usize,
    /// Zero-based index of this chunk.
    pub chunk_id: u32,
    /// Total number of chunks that the input was split into.
    pub num_chunks: u32,
    /// Length of the complete, unsplit input.
    pub total_data_len: usize,
}
impl<T: Sized> RadixPartitionInputChunkable for [T] {
    type Out = T;

    /// Splits the slice into `num_chunks` consecutive chunks.
    ///
    /// Every chunk except the last one holds at most the canonical chunk
    /// length computed by `input_chunk_size::<Key>`; the last chunk holds
    /// all remaining elements.
    ///
    /// Because the canonical length is derived from a rounded-up division,
    /// `canonical_chunk_len * chunk_id` can lie beyond the end of a short
    /// slice. Chunk boundaries are therefore clamped to the slice length, so
    /// that such chunks are empty (or shortened) instead of causing an
    /// out-of-bounds panic.
    ///
    /// # Errors
    ///
    /// Propagates the error of `input_chunk_size` if the chunk length cannot
    /// be computed.
    fn input_chunks<Key>(
        &self,
        num_chunks: u32,
    ) -> Result<Vec<RadixPartitionInputChunk<'_, Self::Out>>> {
        let total_data_len = self.len();
        let canonical_chunk_len = input_chunk_size::<Key>(total_data_len, num_chunks)?;

        let chunks = (0..num_chunks)
            .map(|chunk_id| {
                // Clamp the start so that it never points past the slice end.
                let start = canonical_chunk_len
                    .saturating_mul(chunk_id as usize)
                    .min(total_data_len);

                // The last chunk absorbs the remainder; all other chunks are
                // cut off at the slice end if the data runs out early.
                let end = if chunk_id + 1 == num_chunks {
                    total_data_len
                } else {
                    start
                        .saturating_add(canonical_chunk_len)
                        .min(total_data_len)
                };

                RadixPartitionInputChunk {
                    data: &self[start..end],
                    canonical_chunk_len,
                    chunk_id,
                    num_chunks,
                    total_data_len,
                }
            })
            .collect();

        Ok(chunks)
    }
}
impl<T: Sized + DeviceCopy> RadixPartitionInputChunkable for DerefMem<T> {
type Out = T;
fn input_chunks<Key>(
&self,
num_chunks: u32,
) -> Result<Vec<RadixPartitionInputChunk<'_, Self::Out>>> {
self.as_slice().input_chunks::<Key>(num_chunks)
}
}