Table Batched Embedding Operators¶
-
std::tuple<at::Tensor, at::Tensor, std::optional<at::Tensor>> get_unique_indices_cuda(const at::Tensor &linear_indices, const int64_t max_indices, const bool compute_count)¶
Deduplicate the indices.
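A minimal call sketch, assuming the fbgemm_gpu headers are available and a CUDA device is present (the include path and tensor contents here are illustrative, not prescriptive):

#include <ATen/ATen.h>
#include <cstdint>
#include <vector>
// Assumed include path; it may differ across FBGEMM versions.
#include "fbgemm_gpu/split_embeddings_cache_cuda.cuh"

void unique_indices_example() {
  // Linearized indices containing duplicates (e.g. the output of
  // linearize_cache_indices_cuda).
  auto linear_indices =
      at::tensor(std::vector<int64_t>{3, 1, 3, 7, 1}, at::kLong).to(at::kCUDA);
  const int64_t max_indices = 8;  // upper bound on the index space

  // Returns (unique_indices, unique_indices_length, optional counts).
  auto [unique_indices, unique_indices_length, counts] =
      get_unique_indices_cuda(linear_indices, max_indices,
                              /*compute_count=*/true);
}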
-
std::tuple<at::Tensor, at::Tensor, std::optional<at::Tensor>, std::optional<at::Tensor>> get_unique_indices_with_inverse_cuda(const at::Tensor &linear_indices, const int64_t max_indices, const bool compute_count, const bool compute_inverse_indices)¶
Deduplicate the indices. When compute_inverse_indices is true, the inverse mapping from the original positions back to the deduplicated indices is also returned.
-
std::tuple<at::Tensor, at::Tensor, std::optional<at::Tensor>> lru_cache_find_uncached_cuda(at::Tensor unique_indices, at::Tensor unique_indices_length, int64_t max_indices, at::Tensor lxu_cache_state, int64_t time_stamp, at::Tensor lru_state, bool gather_cache_stats, at::Tensor uvm_cache_stats, bool lock_cache_line, at::Tensor lxu_cache_locking_counter, const bool compute_inverse_indices)¶
Look up the LRU cache to find the uncached indices, then sort them by cache set.
-
int64_t host_lxu_cache_slot(int64_t h_in, int64_t C)¶
Map an index to a cache_set. h_in: linear_indices; C: #cache_sets.
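Because this helper runs on the host, it can be used to inspect set assignment directly. A small sketch (the include path is an assumption):

#include <cstdint>
#include <iostream>
// Assumed include path; it may differ across FBGEMM versions.
#include "fbgemm_gpu/split_embeddings_cache_cuda.cuh"

int main() {
  const int64_t C = 1024;           // number of cache sets
  const int64_t linear_index = 42;  // a linearized row index
  // The set of the set-associative LXU cache that this row hashes to;
  // the result lies in [0, C).
  const int64_t cache_set = host_lxu_cache_slot(linear_index, C);
  std::cout << "index 42 maps to cache set " << cache_set << "\n";
  return 0;
}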
-
at::Tensor linearize_cache_indices_cuda(const at::Tensor &cache_hash_size_cumsum, const at::Tensor &indices, const at::Tensor &offsets, const std::optional<at::Tensor> &B_offsets, const int64_t max_B, const int64_t indices_base_offset)¶
Linearize the indices of all tables so that every index becomes globally unique.
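The linearization gives every table a disjoint index range: assuming the usual TBE layout, index i of table t maps to cache_hash_size_cumsum[t] + i. A hedged sketch:

#include <ATen/ATen.h>
#include <cstdint>
#include <optional>
#include <vector>
// Assumed include path; it may differ across FBGEMM versions.
#include "fbgemm_gpu/split_embeddings_cache_cuda.cuh"

void linearize_example() {
  // Two tables with hash sizes 100 and 200 -> cumulative sizes [0, 100, 300].
  auto cache_hash_size_cumsum =
      at::tensor(std::vector<int64_t>{0, 100, 300}, at::kLong).to(at::kCUDA);
  // Flat per-table indices plus CSR-style offsets: table 0 owns indices[0:2],
  // table 1 owns indices[2:3] (batch size 1 per table in this toy example).
  auto indices =
      at::tensor(std::vector<int64_t>{5, 7, 11}, at::kLong).to(at::kCUDA);
  auto offsets =
      at::tensor(std::vector<int64_t>{0, 2, 3}, at::kLong).to(at::kCUDA);

  // Index i of table t maps to cache_hash_size_cumsum[t] + i, so the
  // expected output here is {5, 7, 111}.
  auto linear = linearize_cache_indices_cuda(
      cache_hash_size_cumsum, indices, offsets,
      /*B_offsets=*/std::nullopt,
      /*max_B=*/-1,  // assumed to be ignored when B_offsets is absent
      /*indices_base_offset=*/0);
}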
-
at::Tensor linearize_cache_indices_from_row_idx_cuda(at::Tensor cache_hash_size_cumsum, at::Tensor update_table_indices, at::Tensor update_row_indices)¶
Linearize the indices of all tables so that every index becomes globally unique. Note that update_table_indices and update_row_indices come in the row-index format used for in-place updates.
-
at::Tensor direct_mapped_lxu_cache_lookup_cuda(at::Tensor linear_cache_indices, at::Tensor lxu_cache_state, int64_t invalid_index, bool gather_cache_stats, std::optional<at::Tensor> uvm_cache_stats)¶
Look up the LRU/LFU cache: find the cached weight locations for all indices. Looks up the slots in the cache corresponding to linear_cache_indices, with a sentinel value for missing entries.
-
void lru_cache_populate_cuda(at::Tensor weights, at::Tensor hash_size_cumsum, int64_t total_cache_hash_size, at::Tensor cache_index_table_map, at::Tensor weights_offsets, at::Tensor D_offsets, at::Tensor linear_cache_indices, at::Tensor lxu_cache_state, at::Tensor lxu_cache_weights, int64_t time_stamp, at::Tensor lru_state, bool stochastic_rounding, bool gather_cache_stats, std::optional<at::Tensor> uvm_cache_stats, bool lock_cache_line, std::optional<at::Tensor> lxu_cache_locking_counter)¶
LRU cache: fetch the rows corresponding to linear_cache_indices from weights and insert them into the cache at timestep time_stamp.
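A sketch of how a cache refresh is typically orchestrated around this call, together with lxu_cache_lookup_cuda (listed further below). It assumes the caller has already allocated the cache state tensors (weights, lxu_cache_state, lxu_cache_weights, lru_state, ...) the way the TBE module does; the sentinel -1 and the disabled options are illustrative choices:

#include <ATen/ATen.h>
#include <cstdint>
#include <optional>
// Assumed include path; it may differ across FBGEMM versions.
#include "fbgemm_gpu/split_embeddings_cache_cuda.cuh"

at::Tensor refresh_lru_cache(
    at::Tensor weights, at::Tensor hash_size_cumsum,
    int64_t total_cache_hash_size, at::Tensor cache_index_table_map,
    at::Tensor weights_offsets, at::Tensor D_offsets,
    at::Tensor linear_cache_indices, at::Tensor lxu_cache_state,
    at::Tensor lxu_cache_weights, int64_t time_stamp, at::Tensor lru_state) {
  // 1) Pull any missing rows from weights into the cache, evicting the
  //    least-recently-used lines as needed.
  lru_cache_populate_cuda(
      weights, hash_size_cumsum, total_cache_hash_size, cache_index_table_map,
      weights_offsets, D_offsets, linear_cache_indices, lxu_cache_state,
      lxu_cache_weights, time_stamp, lru_state,
      /*stochastic_rounding=*/false,
      /*gather_cache_stats=*/false, /*uvm_cache_stats=*/std::nullopt,
      /*lock_cache_line=*/false, /*lxu_cache_locking_counter=*/std::nullopt);

  // 2) Resolve every index to its row in lxu_cache_weights; rows that are
  //    still uncached come back as invalid_index (-1 here, an assumption).
  return lxu_cache_lookup_cuda(
      linear_cache_indices, lxu_cache_state, /*invalid_index=*/-1,
      /*gather_cache_stats=*/false, /*uvm_cache_stats=*/std::nullopt,
      /*num_uniq_cache_indices=*/std::nullopt,
      /*lxu_cache_locations_output=*/std::nullopt);
}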
-
void lru_cache_populate_byte_cuda(at::Tensor weights, at::Tensor hash_size_cumsum, int64_t total_cache_hash_size, at::Tensor cache_index_table_map, at::Tensor weights_offsets, at::Tensor weights_tys, at::Tensor D_offsets, at::Tensor linear_cache_indices, at::Tensor lxu_cache_state, at::Tensor lxu_cache_weights, int64_t time_stamp, at::Tensor lru_state, int64_t row_alignment, bool gather_cache_stats, std::optional<at::Tensor> uvm_cache_stats)¶
LRU cache: fetch the rows corresponding to linear_cache_indices from weights and insert them into the cache at timestep time_stamp. weights and lxu_cache_weights have "uint8_t" byte elements.
-
void direct_mapped_lru_cache_populate_byte_cuda(at::Tensor weights, at::Tensor hash_size_cumsum, int64_t total_cache_hash_size, at::Tensor cache_index_table_map, at::Tensor weights_offsets, at::Tensor weights_tys, at::Tensor D_offsets, at::Tensor linear_cache_indices, at::Tensor lxu_cache_state, at::Tensor lxu_cache_weights, int64_t time_stamp, at::Tensor lru_state, at::Tensor lxu_cache_miss_timestamp, int64_t row_alignment, bool gather_cache_stats, std::optional<at::Tensor> uvm_cache_stats)¶
Direct-mapped (assoc=1) variant of lru_cache_populate_byte_cuda.
-
void lfu_cache_populate_cuda(at::Tensor weights, at::Tensor cache_hash_size_cumsum, int64_t total_cache_hash_size, at::Tensor cache_index_table_map, at::Tensor weights_offsets, at::Tensor D_offsets, at::Tensor linear_cache_indices, at::Tensor lxu_cache_state, at::Tensor lxu_cache_weights, at::Tensor lfu_state, bool stochastic_rounding)¶
LFU cache: fetch the rows corresponding to linear_cache_indices from weights and insert them into the cache.
-
void lfu_cache_populate_byte_cuda(at::Tensor weights, at::Tensor cache_hash_size_cumsum, int64_t total_cache_hash_size, at::Tensor cache_index_table_map, at::Tensor weights_offsets, at::Tensor weights_tys, at::Tensor D_offsets, at::Tensor linear_cache_indices, at::Tensor lxu_cache_state, at::Tensor lxu_cache_weights, at::Tensor lfu_state, int64_t row_alignment)¶
LFU cache: fetch the rows corresponding to linear_cache_indices from weights and insert them into the cache. weights and lxu_cache_weights have "uint8_t" byte elements.
-
at::Tensor lxu_cache_lookup_cuda(at::Tensor linear_cache_indices, at::Tensor lxu_cache_state, int64_t invalid_index, bool gather_cache_stats, std::optional<at::Tensor> uvm_cache_stats, std::optional<at::Tensor> num_uniq_cache_indices, std::optional<at::Tensor> lxu_cache_locations_output)¶
Look up the LRU/LFU cache: find the cached weight locations for all indices. Looks up the slots in the cache corresponding to linear_cache_indices, with a sentinel value for missing entries.
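For instance, one might count misses after a lookup by comparing against the chosen sentinel. A sketch; using -1 as invalid_index is a convention assumed here, not mandated by the API:

#include <ATen/ATen.h>
#include <cstdint>
#include <optional>
// Assumed include path; it may differ across FBGEMM versions.
#include "fbgemm_gpu/split_embeddings_cache_cuda.cuh"

int64_t count_cache_misses(at::Tensor linear_cache_indices,
                           at::Tensor lxu_cache_state) {
  auto locations = lxu_cache_lookup_cuda(
      linear_cache_indices, lxu_cache_state, /*invalid_index=*/-1,
      /*gather_cache_stats=*/false, /*uvm_cache_stats=*/std::nullopt,
      /*num_uniq_cache_indices=*/std::nullopt,
      /*lxu_cache_locations_output=*/std::nullopt);
  // Every location equal to the sentinel is a miss.
  return locations.eq(-1).sum().item<int64_t>();
}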
-
at::Tensor emulate_cache_miss(at::Tensor lxu_cache_locations, const int64_t enforced_misses_per_256, const bool gather_cache_stats, at::Tensor uvm_cache_stats)¶
-
void lxu_cache_flush_cuda(at::Tensor uvm_weights, at::Tensor cache_hash_size_cumsum, at::Tensor cache_index_table_map, at::Tensor weights_offsets, at::Tensor D_offsets, int64_t total_D, at::Tensor lxu_cache_state, at::Tensor lxu_cache_weights, bool stochastic_rounding)¶
Flush the cache: store the weights from the cache back to the backing storage.
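A typical use is to write cached rows back before checkpointing. A pass-through sketch, assuming the cache state tensors come from the TBE module:

#include <ATen/ATen.h>
#include <cstdint>
// Assumed include path; it may differ across FBGEMM versions.
#include "fbgemm_gpu/split_embeddings_cache_cuda.cuh"

// Write all cached rows back to the backing (UVM) storage, e.g. before
// saving a checkpoint, so that uvm_weights holds the up-to-date values.
void flush_before_checkpoint(
    at::Tensor uvm_weights, at::Tensor cache_hash_size_cumsum,
    at::Tensor cache_index_table_map, at::Tensor weights_offsets,
    at::Tensor D_offsets, int64_t total_D, at::Tensor lxu_cache_state,
    at::Tensor lxu_cache_weights) {
  lxu_cache_flush_cuda(uvm_weights, cache_hash_size_cumsum,
                       cache_index_table_map, weights_offsets, D_offsets,
                       total_D, lxu_cache_state, lxu_cache_weights,
                       /*stochastic_rounding=*/false);
}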
-
void reset_weight_momentum_cuda(at::Tensor dev_weights, at::Tensor uvm_weights, at::Tensor lxu_cache_weights, at::Tensor weights_placements, at::Tensor weights_offsets, at::Tensor momentum1_dev, at::Tensor momentum1_uvm, at::Tensor momentum1_placements, at::Tensor momentum1_offsets, at::Tensor D_offsets, at::Tensor pruned_indices, at::Tensor pruned_indices_offsets, at::Tensor logical_table_ids, at::Tensor buffer_ids, at::Tensor cache_hash_size_cumsum, at::Tensor lxu_cache_state, int64_t total_cache_hash_size)¶