I recently ran into this issue myself and had to patch the code to reduce the model size. The fastText C++ code includes a handy function, threshold, for reducing the dictionary size, but it is not exposed in the Python bindings. After the dictionary reduction you also need to rebuild the input matrix, including the n-gram buckets that come after the main word vectors. A model saved this way generates all word vectors from the subword information alone (except for the dictionary words that remain).
For word-similarity search, output_ and model_ are not used, so to save even more memory you can also comment out the part of saveModel that writes output_.
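For reference, this is roughly what that change looks like against the saveModel in the fastText source I was patching (0.9.x; the exact body may differ in your version). Note that loadModel reads the file back in the same order, so it needs the mirror change, and anything that relies on output_ (predict, retraining) will no longer work on a model saved this way:

void FastText::saveModel(const std::string& filename) {
  std::ofstream ofs(filename, std::ofstream::binary);
  if (!ofs.is_open()) {
    throw std::invalid_argument(filename + " cannot be opened for saving!");
  }
  signModel(ofs);
  args_->save(ofs);
  dict_->save(ofs);
  ofs.write((char*)&(quant_), sizeof(bool));
  input_->save(ofs);
  ofs.write((char*)&(args_->qout), sizeof(bool));
  /* skip the output matrix; loadModel must get the matching change */
  /*output_->save(ofs);*/
  ofs.close();
}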
Note that the n-gram entries by themselves are about 2 GB in the pretrained English model, so that is roughly the smallest you can make the model even if all dictionary words are removed.
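That floor comes straight from the hashing-trick parameters: with the default 2,000,000 n-gram buckets and 300 dimensions, the bucket block alone is 2,000,000 × 300 × 4 bytes per float ≈ 2.4 GB, regardless of how far you cut the dictionary.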
/* note: some dict_ members (size_, words_) are made public here for easier access */
void FastText::quantize(const Args& qargs) {
  /* disabled: we are shrinking an unsupervised model, not a supervised one
  if (args_->model != model_name::sup) {
    throw std::invalid_argument(
        "For now we only support quantization of supervised models");
  }
  */
  args_->input = qargs.input;
  args_->qout = qargs.qout;
  args_->output = qargs.output;
  std::shared_ptr<DenseMatrix> input =
      std::dynamic_pointer_cast<DenseMatrix>(input_);
  std::shared_ptr<DenseMatrix> output =
      std::dynamic_pointer_cast<DenseMatrix>(output_);
  bool normalizeGradient = (args_->model == model_name::sup);
  if (qargs.cutoff > 0 && qargs.cutoff < input->size(0)) {
    /* disabled: prune() is the supervised-only embedding selection
    auto idx = selectEmbeddings(qargs.cutoff);
    dict_->prune(idx);
    */
    // row count of the old input matrix: dictionary entries + n-gram buckets
    int32_t rows = dict_->size_ + args_->bucket;
    // drop every entry with count < 2000; this shrinks and re-indexes the dictionary
    dict_->threshold(2000, 2000);
    std::cerr << "words: " << dict_->size_ << std::endl;  // new dictionary size
    std::cerr << "rows: " << rows << std::endl;           // old matrix rows
    /*std::shared_ptr<DenseMatrix> ninput =
        std::make_shared<DenseMatrix>(idx.size(), args_->dim);*/
    // new matrix: surviving words first, then the unchanged n-gram buckets
    int32_t new_rows = dict_->size_ + args_->bucket;
    std::shared_ptr<DenseMatrix> ninput =
        std::make_shared<DenseMatrix>(new_rows, args_->dim);
    // copy the vectors of the surviving dictionary words
    for (auto i = 0; i < dict_->size_; i++) {
      int32_t index = dict_->getId(dict_->words_[i].word);
      for (auto j = 0; j < args_->dim; j++) {
        ninput->at(i, j) = input->at(index, j);
      }
    }
    // copy the n-gram bucket rows, shifted up by the number of removed words
    int32_t offset = rows - new_rows;
    for (auto i = dict_->size_; i < new_rows; i++) {
      for (auto j = 0; j < args_->dim; j++) {
        ninput->at(i, j) = input->at(i + offset, j);
      }
    }
    /*input = ninput;*/
    input_ = ninput;
    if (qargs.retrain) {
      args_->epoch = qargs.epoch;
      args_->lr = qargs.lr;
      args_->thread = qargs.thread;
      args_->verbose = qargs.verbose;
      auto loss = createLoss(output_);
      // retrain against the pruned matrix (passing the old `input` here,
      // as the first version of this patch did, would train the wrong rows)
      model_ = std::make_shared<Model>(input_, output, loss, normalizeGradient);
      startThreads();
    }
  }
  /* disabled: skip the actual product quantization, we only want the pruning
  input_ = std::make_shared<QuantMatrix>(
      std::move(*(input.get())), qargs.dsub, qargs.qnorm);
  if (args_->qout) {
    output_ = std::make_shared<QuantMatrix>(
        std::move(*(output.get())), 2, qargs.qnorm);
  }
  quant_ = true;
  */
  auto loss = createLoss(output_);
  model_ = std::make_shared<Model>(input_, output_, loss, normalizeGradient);
}
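To run it, rebuild fastText and use the stock quantize command, which loads <output>.bin, calls the function above, and writes <output>.ftz. My invocation looked roughly like this (cc.en.300 and dummy.txt are placeholders):

./fasttext quantize -input dummy.txt -output cc.en.300 -cutoff 100000

Two things to keep in mind: -cutoff only has to be positive and smaller than the current number of input rows to enter the pruning branch (the actual count threshold of 2000 is hardcoded above), and -input is only read if you pass -retrain, though the argument parser in the version I patched still insists on it. Since the QuantMatrix conversion is commented out, the resulting .ftz is really just a smaller dense model and loads like any other.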