1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
use super::data_point::DataPoint;
use super::{Benchmark, ItemBytes, MemoryOperation};
use crate::types::{Cycles, ThreadCount};
use itertools::{iproduct, izip};
use numa_gpu::runtime::cpu_affinity::CpuAffinity;
use numa_gpu::runtime::memory::DerefMem;
use std::iter;
use std::rc::Rc;
use std::sync::Arc;
/// Drives a caller-supplied benchmark closure on the CPU over a set of thread
/// counts and records every run as a `DataPoint`.
pub(super) struct CpuMeasurement {
/// Thread counts to measure; `measure` builds a Rayon thread pool for each entry.
threads: Vec<ThreadCount>,
/// CPU affinity settings, held in an `Arc` so they can be shared with the
/// start handlers of the thread pools built by `measure`.
cpu_affinity: Arc<CpuAffinity>,
/// Base data point; fields that `measure` does not set itself are copied from it.
template: DataPoint,
}
impl CpuMeasurement {
pub(super) fn new(
threads: Vec<ThreadCount>,
cpu_affinity: CpuAffinity,
template: DataPoint,
) -> Self {
let cpu_affinity = Arc::new(cpu_affinity);
Self {
threads,
cpu_affinity,
template,
}
}
pub(super) fn measure<R, S>(
&self,
mem: &DerefMem<u32>,
mut state: S,
run: R,
benches: Vec<Benchmark>,
ops: Vec<MemoryOperation>,
item_bytes: Vec<ItemBytes>,
repeat: u32,
) -> Vec<DataPoint>
where
R: Fn(
Benchmark,
MemoryOperation,
ItemBytes,
&mut S,
&DerefMem<u32>,
Rc<rayon::ThreadPool>,
) -> (u32, u64, Cycles, u64),
{
let cpu_affinity = &self.cpu_affinity;
let data_points: Vec<_> = iproduct!(
iproduct!(
benches.iter(),
ops.iter(),
item_bytes.iter(),
self.threads.iter().map(|&ThreadCount(t)| {
let cpu_affinity = cpu_affinity.clone();
let thread_pool = Rc::new(
rayon::ThreadPoolBuilder::new()
.num_threads(t)
.start_handler(move |tid| {
cpu_affinity
.clone()
.set_affinity(tid as u16)
.expect("Couldn't set CPU core affinity")
})
.build()
.expect("Couldn't build Rayon thread pool"),
);
thread_pool
})
),
izip!(iter::once(true).chain(iter::repeat(false)), 0..repeat)
)
.map(
|((&bench, &op, &item_bytes, thread_pool), (warm_up, _run_number))| {
let threads = ThreadCount(thread_pool.current_num_threads());
let (clock_rate_mhz, memory_accesses, cycles, ns) =
run(bench, op, item_bytes, &mut state, mem, thread_pool);
DataPoint {
benchmark: Some(bench),
memory_operation: Some(op),
item_bytes: Some(item_bytes),
warm_up,
threads: Some(threads),
throttle_reasons: None,
clock_rate_mhz: Some(clock_rate_mhz),
memory_accesses,
cycles,
ns,
..self.template.clone()
}
},
)
.collect();
data_points
}
}