(prototype) FX Graph Mode Post Training Dynamic Quantization
Created On: Jan 05, 2021 | Last Updated: Aug 27, 2024 | Last Verified: Nov 05, 2024
Author: Jerry Zhang
This tutorial introduces the steps to do post training dynamic quantization in graph mode based on torch.fx. We have a separate tutorial for FX Graph Mode Post Training Static Quantization, and a comparison between FX Graph Mode Quantization and Eager Mode Quantization can be found in the quantization docs.
tldr; The FX Graph Mode API for dynamic quantization looks like the following:
import torch
from torch.ao.quantization import default_dynamic_qconfig, QConfigMapping
# Note that this is temporary, we'll expose these functions to torch.ao.quantization after official release
from torch.quantization.quantize_fx import prepare_fx, convert_fx
float_model.eval()
# use the default dynamic qconfig imported above
qconfig = default_dynamic_qconfig
qconfig_mapping = QConfigMapping().set_global(qconfig)
prepared_model = prepare_fx(float_model, qconfig_mapping, example_inputs) # fuse modules and insert observers
# no calibration is required for dynamic quantization
quantized_model = convert_fx(prepared_model) # convert the model to a dynamically quantized model
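The snippet above assumes that float_model and example_inputs are already defined. As a minimal sketch of what those placeholders might look like (the tiny network below is purely illustrative, not part of this tutorial's model):

import torch
import torch.nn as nn

# Any eval-mode floating point module can serve as float_model;
# example_inputs is a tuple of example inputs that prepare_fx uses to trace it.
float_model = nn.Sequential(nn.Linear(16, 8), nn.ReLU(), nn.Linear(8, 4))
example_inputs = (torch.randn(1, 16),)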
In this tutorial, we'll apply dynamic quantization to an LSTM-based next word prediction model, closely following the word language model from the PyTorch examples. We'll copy the code from Dynamic Quantization on an LSTM Word Language Model and omit the descriptions.
1. Define the Model, Download Data and Model
Download the data and unzip it to the data folder:
mkdir data
cd data
wget https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-v1.zip
unzip wikitext-2-v1.zip
Download the model to the data folder:
wget https://s3.amazonaws.com/pytorch-tutorial-assets/word_language_model_quantize.pth
Define the model:
# imports
import os
from io import open
import time
import copy
import torch
import torch.nn as nn
import torch.nn.functional as F
# Model Definition
class LSTMModel(nn.Module):
"""Container module with an encoder, a recurrent module, and a decoder."""
def __init__(self, ntoken, ninp, nhid, nlayers, dropout=0.5):
super(LSTMModel, self).__init__()
self.drop = nn.Dropout(dropout)
self.encoder = nn.Embedding(ntoken, ninp)
self.rnn = nn.LSTM(ninp, nhid, nlayers, dropout=dropout)
self.decoder = nn.Linear(nhid, ntoken)
self.init_weights()
self.nhid = nhid
self.nlayers = nlayers
def init_weights(self):
initrange = 0.1
self.encoder.weight.data.uniform_(-initrange, initrange)
self.decoder.bias.data.zero_()
self.decoder.weight.data.uniform_(-initrange, initrange)
def forward(self, input, hidden):
emb = self.drop(self.encoder(input))
output, hidden = self.rnn(emb, hidden)
output = self.drop(output)
decoded = self.decoder(output)
return decoded, hidden
def init_hidden(lstm_model, bsz):
# get the weight tensor and create hidden layer in the same device
weight = lstm_model.encoder.weight
# get weight from quantized model
if not isinstance(weight, torch.Tensor):
weight = weight()
device = weight.device
nlayers = lstm_model.rnn.num_layers
nhid = lstm_model.rnn.hidden_size
return (torch.zeros(nlayers, bsz, nhid, device=device),
torch.zeros(nlayers, bsz, nhid, device=device))
# Load Text Data
class Dictionary(object):
def __init__(self):
self.word2idx = {}
self.idx2word = []
def add_word(self, word):
if word not in self.word2idx:
self.idx2word.append(word)
self.word2idx[word] = len(self.idx2word) - 1
return self.word2idx[word]
def __len__(self):
return len(self.idx2word)
class Corpus(object):
def __init__(self, path):
self.dictionary = Dictionary()
self.train = self.tokenize(os.path.join(path, 'wiki.train.tokens'))
self.valid = self.tokenize(os.path.join(path, 'wiki.valid.tokens'))
self.test = self.tokenize(os.path.join(path, 'wiki.test.tokens'))
def tokenize(self, path):
"""Tokenizes a text file."""
assert os.path.exists(path)
# Add words to the dictionary
with open(path, 'r', encoding="utf8") as f:
for line in f:
words = line.split() + ['<eos>']
for word in words:
self.dictionary.add_word(word)
# Tokenize file content
with open(path, 'r', encoding="utf8") as f:
idss = []
for line in f:
words = line.split() + ['<eos>']
ids = []
for word in words:
ids.append(self.dictionary.word2idx[word])
idss.append(torch.tensor(ids).type(torch.int64))
ids = torch.cat(idss)
return ids
model_data_filepath = 'data/'
corpus = Corpus(model_data_filepath + 'wikitext-2')
ntokens = len(corpus.dictionary)
# Load Pretrained Model
model = LSTMModel(
ntoken = ntokens,
ninp = 512,
nhid = 256,
nlayers = 5,
)
model.load_state_dict(
torch.load(
model_data_filepath + 'word_language_model_quantize.pth',
map_location=torch.device('cpu'),
weights_only=True
)
)
model.eval()
print(model)
bptt = 25
criterion = nn.CrossEntropyLoss()
eval_batch_size = 1
# create test data set
def batchify(data, bsz):
# Work out how cleanly we can divide the dataset into bsz parts.
nbatch = data.size(0) // bsz
# Trim off any extra elements that wouldn't cleanly fit (remainders).
data = data.narrow(0, 0, nbatch * bsz)
# Evenly divide the data across the bsz batches.
return data.view(bsz, -1).t().contiguous()
test_data = batchify(corpus.test, eval_batch_size)
example_inputs = (next(iter(test_data))[0])
# Evaluation functions
def get_batch(source, i):
seq_len = min(bptt, len(source) - 1 - i)
data = source[i:i+seq_len]
target = source[i+1:i+1+seq_len].reshape(-1)
return data, target
def repackage_hidden(h):
"""Wraps hidden states in new Tensors, to detach them from their history."""
if isinstance(h, torch.Tensor):
return h.detach()
else:
return tuple(repackage_hidden(v) for v in h)
def evaluate(model_, data_source):
# Turn on evaluation mode which disables dropout.
model_.eval()
total_loss = 0.
hidden = init_hidden(model_, eval_batch_size)
with torch.no_grad():
for i in range(0, data_source.size(0) - 1, bptt):
data, targets = get_batch(data_source, i)
output, hidden = model_(data, hidden)
hidden = repackage_hidden(hidden)
output_flat = output.view(-1, ntokens)
total_loss += len(data) * criterion(output_flat, targets).item()
return total_loss / (len(data_source) - 1)
2. Post Training Dynamic Quantization
Now we can dynamically quantize the model. We can use the same functions as in post training static quantization, but with a dynamic qconfig.
from torch.quantization.quantize_fx import prepare_fx, convert_fx
from torch.ao.quantization import default_dynamic_qconfig, float_qparams_weight_only_qconfig, QConfigMapping
# Full docs for supported qconfig for floating point modules/ops can be found in `quantization docs <https://pytorch.dev.org.tw/docs/stable/quantization.html#module-torch.quantization>`_
# Full docs for `QConfigMapping <https://pytorch.dev.org.tw/docs/stable/generated/torch.ao.quantization.qconfig_mapping.QConfigMapping.html#torch.ao.quantization.qconfig_mapping.QConfigMapping>`_
qconfig_mapping = (QConfigMapping()
.set_object_type(nn.Embedding, float_qparams_weight_only_qconfig)
.set_object_type(nn.LSTM, default_dynamic_qconfig)
.set_object_type(nn.Linear, default_dynamic_qconfig)
)
# Load the model again to create a fresh copy, because the quantization API changes
# the model in place and we want to keep the original model for later comparison
model_to_quantize = LSTMModel(
ntoken = ntokens,
ninp = 512,
nhid = 256,
nlayers = 5,
)
model_to_quantize.load_state_dict(
    torch.load(
        model_data_filepath + 'word_language_model_quantize.pth',
        map_location=torch.device('cpu'),
        weights_only=True
    )
)
model_to_quantize.eval()
prepared_model = prepare_fx(model_to_quantize, qconfig_mapping, example_inputs)
print("prepared model:", prepared_model)
quantized_model = convert_fx(prepared_model)
print("quantized model", quantized_model)
For dynamically quantized objects, we didn't do anything in prepare_fx for the modules, but will insert observers for the weights of dynamically quantizable functionals and torch ops. We also fuse modules such as Conv + Bn and Linear + ReLU.
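To get a feel for what a dynamic qconfig contains, we can simply print it. This is just an inspection sketch; the exact observer names printed may vary across PyTorch versions:

from torch.ao.quantization import default_dynamic_qconfig

# Only the weight entry carries a real observer; the activation entry is a
# placeholder because activations are quantized on the fly at runtime.
print(default_dynamic_qconfig)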
In convert, we'll convert float modules to dynamically quantized modules and convert float ops to dynamically quantized ops. We can see in the example model that nn.Embedding, nn.Linear and nn.LSTM are dynamically quantized.
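One way to verify the swap is to scan the converted model for quantized submodules. The module-path check below is a heuristic sketch, not an official API:

# List submodules whose classes live in the quantized namespaces,
# e.g. torch.ao.nn.quantized.dynamic.modules.*
for name, module in quantized_model.named_modules():
    if 'quantized' in type(module).__module__:
        print(name, '->', type(module).__name__)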
Now we can compare the size and runtime of the quantized model.
def print_size_of_model(model):
torch.save(model.state_dict(), "temp.p")
print('Size (MB):', os.path.getsize("temp.p")/1e6)
os.remove('temp.p')
print_size_of_model(model)
print_size_of_model(quantized_model)
There is a 4x size reduction because we quantized all the weights in the model (nn.Embedding, nn.Linear and nn.LSTM) from float (4 bytes) to quantized int (1 byte).
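A rough back-of-the-envelope check of the 4x figure (this sketch ignores the small per-tensor scales and zero points stored alongside the quantized weights):

num_params = sum(p.numel() for p in model.parameters())
print('approx float32 weights (MB):', num_params * 4 / 1e6)  # 4 bytes per float32
print('approx int8 weights (MB):', num_params * 1 / 1e6)     # 1 byte per qint8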
torch.set_num_threads(1)
def time_model_evaluation(model, test_data):
s = time.time()
loss = evaluate(model, test_data)
elapsed = time.time() - s
print('''loss: {0:.3f}\nelapsed time (seconds): {1:.1f}'''.format(loss, elapsed))
time_model_evaluation(model, test_data)
time_model_evaluation(quantized_model, test_data)
There is a roughly 2x speedup for this model. Also note that the speedup may vary depending on the model, device, build, input batch sizes, threading, etc.
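Since threading is one of those variables, a quick way to probe it is to repeat the timing at a few intra-op thread counts, reusing time_model_evaluation and test_data from above:

# Sketch: re-run the benchmark at different intra-op thread counts.
for n in (1, 2, 4):
    torch.set_num_threads(n)
    print('threads:', n)
    time_model_evaluation(quantized_model, test_data)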