pub fn gpu_radix_join<T>(
    data: &mut JoinData<T>, 
    hashing_scheme: HashingScheme, 
    histogram_algorithm_fst: DeviceType<CpuHistogramAlgorithm, GpuHistogramAlgorithm>, 
    histogram_algorithm_snd: DeviceType<CpuHistogramAlgorithm, GpuHistogramAlgorithm>, 
    partition_algorithm_fst: DeviceType<CpuRadixPartitionAlgorithm, GpuRadixPartitionAlgorithm>, 
    partition_algorithm_snd: DeviceType<CpuRadixPartitionAlgorithm, GpuRadixPartitionAlgorithm>, 
    radix_bits: &RadixBits, 
    dmem_buffer_bytes: usize, 
    _max_partitions_cache_bytes: Option<usize>, 
    threads: usize, 
    cpu_affinity: CpuAffinity, 
    partitions_mem_type: MemType, 
    stream_state_mem_type: MemType, 
    _page_type: PageType, 
    partition_dim: (&GridSize, &BlockSize), 
    join_dim: (&GridSize, &BlockSize)
) -> Result<(i64, RadixJoinPoint)> where
    T: Default + Clone + DeviceCopy + Sync + Send + CpuRadixPartitionable + GpuRadixPartitionable + KeyAttribute + CudaHashJoinable + CpuHashJoinable + CudaRadixJoinable,