When I read the intset data structure source code, I thought maybe we can split the different types into different spaces. We already know how big the inserted number is, so we can optimize intset and not waste memory space. I tried it with C++ (the code can be refactored).
intset.h
#ifndef _IntSet_H
#define _IntSet_H
#include <cstdint>
#include <cstddef>
#include <cstdlib>
#include <cstring>
#include <algorithm>
#include <cassert>
#include <iostream>
// Element widths in bytes for each supported integer encoding.
// Invariant relied on throughout: the encoding value doubles as the
// sizeof() of the stored element type (2, 4 or 8).
constexpr uint8_t ENCSET_ENC_INT16 = sizeof(int16_t);
constexpr uint8_t ENCSET_ENC_INT32 = sizeof(int32_t);
constexpr uint8_t ENCSET_ENC_INT64 = sizeof(int64_t);
// All encodings, ordered from narrowest to widest; this ordering defines
// the iteration order over IntSet's buckets.
constexpr uint8_t ENCSET_ENC_ENCODINGS[] = {ENCSET_ENC_INT16, ENCSET_ENC_INT32, ENCSET_ENC_INT64};
constexpr uint8_t ENCSET_ENC_NUM = sizeof(ENCSET_ENC_ENCODINGS) / sizeof(uint8_t);
// Return codes for IntSet's query/mutation operations.
constexpr int FOUND = 1;
constexpr int NOT_FOUND = -1;
// One sorted, dynamically allocated array whose elements all share a single
// fixed width (`encoding` bytes each). IntSet keeps one EncSet per width.
struct EncSet {
friend std::ostream& operator<<(std::ostream &out, const EncSet &es);
public:
// Shift elements [from, length) so they begin at index `to` (overlap-safe).
void MoveTail(size_t from, size_t to);
// Reallocate the buffer to hold `newlength` elements; caller updates `length`.
void Resize(size_t newlength);
// Write `value` (truncated to the element width) at index i.
void Set(size_t i, int64_t value);
// Read the element at `pos`, widened to int64_t; asserts pos < length.
int64_t Get(size_t pos) const;
// Index of the first element >= value (== length when no such element).
int LowerBound(int64_t value);
EncSet();
~EncSet();
char* content;    // heap buffer holding `length` elements of `encoding` bytes
uint8_t encoding; // element width in bytes (2, 4 or 8)
uint32_t length;  // number of elements currently stored
};
// A set of int64 values stored as ENCSET_ENC_NUM sorted buckets, one per
// element width, so every value occupies only the bytes it needs.
class IntSet
{
friend std::ostream& operator<<(std::ostream &out, const IntSet &es);
private:
EncSet EncSets[ENCSET_ENC_NUM]; // one bucket per encoding width
size_t size;                    // total element count across all buckets
// Smallest element width (in bytes) that can represent v.
uint8_t _ValueEncoding(int64_t v);
// Index in EncSets of the bucket with the given encoding.
size_t _FindEncSet(uint8_t encoding);
// Element at global position pos, counting across buckets in encoding order.
uint64_t _Get(size_t pos);
// NOTE(review): declared but not defined anywhere in this file.
int _Find(int64_t value);
public:
// Each returns 1 on success and -1 on failure (duplicate / not present).
int Insert(int64_t value);
int Remove(int64_t value);
int Find(int64_t value);
// Random element; requires a non-empty set.
uint64_t Random();
// Copy the element at pos into *value; 1 on success, -1 if out of range.
int Get(size_t pos, int64_t *value);
size_t Size();
// Approximate in-memory footprint in bytes.
size_t BlobSize();
IntSet();
~IntSet();
};
#endif
intset.cpp
#include "intset.h"
// Print the bucket's elements in order, separated by single spaces.
// Fixed: loop index is now unsigned to match EncSet::length (the original
// compared a signed int against uint32_t), and the stray semicolon after
// the function body was removed.
std::ostream& operator<<(std::ostream &out, const EncSet &es) {
    for(uint32_t i = 0; i < es.length; ++i){
        out << es.Get(i);
        // No trailing separator after the last element.
        if(i != es.length - 1) out << " ";
    }
    return out;
}
// Print each bucket on its own line, narrowest encoding first.
// Fixed: removed the stray semicolon after the function body.
std::ostream& operator<<(std::ostream &out, const IntSet &is) {
    for(int i = 0; i < ENCSET_ENC_NUM; ++i){
        out << is.EncSets[i];
        // Newline between buckets, but not after the last one.
        if(i != ENCSET_ENC_NUM - 1) out << "\n";
    }
    return out;
}
// Construct an empty EncSet.
// Fixed: `encoding` was previously left uninitialized (indeterminate until
// IntSet's constructor assigned it); it now defaults to a valid width so a
// default-constructed EncSet is never in an undefined state.
EncSet::EncSet(){
    content = NULL;
    encoding = ENCSET_ENC_INT16;
    length = 0;
}
// Release the backing buffer; free(NULL) is a no-op, so destroying an
// empty (never-resized) set is safe.
EncSet::~EncSet(){
free(content);
}
void EncSet::MoveTail(size_t from, size_t to){
void *src, *dst;
size_t bytes = length - from;
if(encoding == ENCSET_ENC_INT16){
src = (int16_t*)content + from;
dst = (int16_t*)content + to;
bytes *= sizeof(int16_t);
} else if(encoding == ENCSET_ENC_INT32){
src = (int32_t*)content + from;
dst = (int32_t*)content + to;
bytes *= sizeof(int32_t);
} else {
src = (int64_t*)content + from;
dst = (int64_t*)content + to;
bytes *= sizeof(int64_t);
}
memmove(dst, src, bytes);
}
// Grow or shrink the backing buffer to hold `newlength` elements; the
// caller adjusts `length` separately.
// Fixed: the realloc result was assigned straight to `content`, so an
// allocation failure both leaked the old buffer and left a NULL pointer
// to be dereferenced later. Assign through a temporary and treat failure
// as fatal, since callers have no error path.
void EncSet::Resize(size_t newlength){
    char *grown = (char*)realloc(content, newlength * encoding);
    // realloc may legitimately return NULL when newlength == 0.
    assert(grown != NULL || newlength == 0);
    content = grown;
}
// Store `value` at index i, truncated to this bucket's element width.
// Callers route each value to the bucket wide enough to hold it, so the
// narrowing is expected to be lossless.
void EncSet::Set(size_t i, int64_t value){
    switch(encoding){
    case ENCSET_ENC_INT16:
        ((int16_t*)content)[i] = value;
        break;
    case ENCSET_ENC_INT32:
        ((int32_t*)content)[i] = value;
        break;
    default: // ENCSET_ENC_INT64
        ((int64_t*)content)[i] = value;
        break;
    }
}
// Read the element at `pos`, widened to int64_t. Out-of-range access is a
// programming error, caught by the assert.
int64_t EncSet::Get(size_t pos) const {
    assert(pos < length);
    switch(encoding){
    case ENCSET_ENC_INT16:
        return ((int16_t*)content)[pos];
    case ENCSET_ENC_INT32:
        return ((int32_t*)content)[pos];
    default: // ENCSET_ENC_INT64
        return ((int64_t*)content)[pos];
    }
}
// Binary-search for the first element >= value; returns `length` when no
// such element exists. `value` is narrowed to the bucket's element type,
// which is lossless because callers dispatch by _ValueEncoding first.
int EncSet::LowerBound(int64_t value){
    switch(encoding){
    case ENCSET_ENC_INT16: {
        int16_t *base = (int16_t*)content;
        return (int)(std::lower_bound(base, base + length, (int16_t)value) - base);
    }
    case ENCSET_ENC_INT32: {
        int32_t *base = (int32_t*)content;
        return (int)(std::lower_bound(base, base + length, (int32_t)value) - base);
    }
    default: { // ENCSET_ENC_INT64
        int64_t *base = (int64_t*)content;
        return (int)(std::lower_bound(base, base + length, value) - base);
    }
    }
}
// Initialize an empty set: every bucket starts with no storage, and each
// is assigned its fixed encoding width from ENCSET_ENC_ENCODINGS.
IntSet::IntSet(){
    size = 0;
    size_t idx = 0;
    for(EncSet &bucket : EncSets){
        bucket.content = NULL;
        bucket.encoding = ENCSET_ENC_ENCODINGS[idx++];
        bucket.length = 0;
    }
}
// Nothing to do here: each EncSet member frees its own buffer in ~EncSet().
IntSet::~IntSet(){
}
// Map an encoding width to the index of its bucket in EncSets.
// Fixed: the original fell off the end of the function with no return
// (undefined behavior) when given an encoding not in the table, and had a
// stray semicolon after the body. Unknown encodings are a programming
// error, so assert; the trailing return keeps NDEBUG builds defined.
size_t IntSet::_FindEncSet(uint8_t encoding) {
    for(size_t i = 0; i < ENCSET_ENC_NUM; ++i){
        if(encoding == EncSets[i].encoding) return i;
    }
    assert(false && "unknown encoding");
    return ENCSET_ENC_NUM - 1; // fallback for NDEBUG builds; never expected
}
// Return the smallest element width (in bytes) able to represent v,
// checking the narrowest range first.
uint8_t IntSet::_ValueEncoding(int64_t v){
    if(v >= INT16_MIN && v <= INT16_MAX) return ENCSET_ENC_INT16;
    if(v >= INT32_MIN && v <= INT32_MAX) return ENCSET_ENC_INT32;
    return ENCSET_ENC_INT64;
}
int IntSet::Insert(int64_t v){
auto i = _FindEncSet(_ValueEncoding(v));
auto j = EncSets[i].LowerBound(v);
if(j != EncSets[i].length && EncSets[i].Get(j) == v) return -1;
EncSets[i].Resize(EncSets[i].length + 1);
if(j != EncSets[i].length){
EncSets[i].MoveTail(j, j + 1);
}
EncSets[i].Set(j, v);
EncSets[i].length += 1;
size += 1;
return 1;
}
int IntSet::Remove(int64_t v){
auto i = _FindEncSet(_ValueEncoding(v));
auto j = EncSets[i].LowerBound(v);
if(j == EncSets[i].length || EncSets[i].Get(j) != v) return -1;
if(j < EncSets[i].length - 1){
EncSets[i].MoveTail(j+1, j);
}
EncSets[i].Resize(EncSets[i].length - 1);
EncSets[i].length -= 1;
size -= 1;
return 1;
}
int IntSet::Find(int64_t v){
auto i = _FindEncSet(_ValueEncoding(v));
auto j = EncSets[i].LowerBound(v);
if(j != EncSets[i].length && EncSets[i].Get(j) == v) return 1;
return -1;
}
// Fetch the element at global position `pos`, counting across buckets in
// encoding order (narrowest first). Precondition: pos < size.
// Fixed: the original fell off the end of the function with no return
// (undefined behavior) for an out-of-range pos; now asserts instead.
// NOTE(review): the uint64_t return reinterprets negative elements as
// unsigned — kept as-is for interface compatibility with Random().
uint64_t IntSet::_Get(size_t pos){
    for(size_t i = 0; i < ENCSET_ENC_NUM; ++i){
        if(pos >= EncSets[i].length) pos -= EncSets[i].length;
        else {
            return EncSets[i].Get(pos);
        }
    }
    assert(false && "pos out of range");
    return 0; // unreachable fallback for NDEBUG builds
}
// Return a pseudo-random element of the set.
// Fixed: calling this on an empty set performed `rand() % 0` — undefined
// behavior (division by zero); callers must now ensure the set is
// non-empty, enforced by the assert.
// NOTE(review): rand() % size is slightly biased when size does not
// divide RAND_MAX + 1; kept to match the original's distribution.
uint64_t IntSet::Random(){
    assert(size > 0);
    return _Get(rand() % size);
}
// Copy the element at position pos into *value (when value is non-null).
// Returns 1 on success, -1 when pos is out of range.
int IntSet::Get(size_t pos, int64_t* value){
    if(pos >= size) return -1; // guard clause: reject out-of-range first
    if(value != NULL){
        *value = _Get(pos);
    }
    return 1;
}
// Total number of elements across all encoding buckets.
size_t IntSet::Size(){
return size;
}
size_t IntSet::BlobSize(){
size_t s = 0;
for(int i = 0; i < ENCSET_ENC_NUM; ++i){
s += EncSets[i].length * EncSets[i].encoding;
}
return sizeof(IntSet) + s;
}
Comment From: oranagra
@qbit-s I don't understand what you are suggesting. The intset code already has the categories to allocate memory according to the size of the value.
BTW, redis is written in C, if you have an idea for improvement, please submit a PR with C code.
Comment From: namespace-io
@oranagra I mean that when we insert a number v, we currently judge how big v is and get the encoding for the provided value v — that is the function _intsetValueEncoding(int64_t v) — and then decide whether to upgrade or not. If we upgrade, space is wasted. So we can preallocate 3 or more different buckets to store values of different encodings so that no space is wasted. For example, we start with 3 buckets: ENCSET_ENC_INT16 (only stores int16 data, every element is 2 bytes), ENCSET_ENC_INT32 (only stores int32 data, every element is 4 bytes), and ENCSET_ENC_INT64 (only stores int64 data, every element is 8 bytes).
For the inputs -1, 4, 1, 65536: we first judge how big each value is and put it in the right bucket. The values -1, 1, 4 are stored in the ENCSET_ENC_INT16 bucket, and 65536 is stored in the ENCSET_ENC_INT32 bucket.
Every bucket stores its numbers in sorted order so that binary search can be used.
Comment From: oranagra
@qbit-s that's a good idea, kinda like storing 3 different intsets. one problem i see with this is that in order to persist it (quickly and efficiently) we need to change the rdb format (so we can only do that in the next redis version).
another thought is that intsets are usually small (limited to 512 entries by default), so i'm not sure there are many use cases that store both very big numbers and small numbers (and no strings) in one set.
in theory another optimization that can be made is add some base offset, so if you want to store 20 consecutive numbers of a high range it'll still be compact. it can be something like the quicklists we have today (which are a linked list of binary blobs). but i'm not sure if the use cases it improves justify the added complexity (current implementation is very naive and simple)
Comment From: madolson
I sort of like the optimization, but I agree with @oranagra that I'm not sure it justifies the complexity. I know of very few use cases that use intsets that aren't already storing items of about the same size. I've seen many more examples for building a list encoding for small sets.