If the list is sorted, for example, if you want to extract K elements out of N, but you do not care about their relative order, an efficient algorithm is proposed in the paper An Efficient Algorithm for Sequential Random Sampling (Jeffrey Scott Vitter, ACM Transactions on Mathematical Software, Vol. 13, No. 1, March 1987, Pages 56-67.).
edited to add the code in c++ using boost. I've just typed it and there might be many errors. The random numbers come from the boost library, with a stupid seed, so don't do anything serious with this.
/* Sampling according to [Vitter87].
*
* Bibliography
* [Vitter 87]
* Jeffrey Scott Vitter,
* An Efficient Algorithm for Sequential Random Sampling
* ACM Transactions on MAthematical Software, 13 (1), 58 (1987).
*/
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <string>
#include <iostream>
#include <iomanip>
#include <boost/random/linear_congruential.hpp>
#include <boost/random/variate_generator.hpp>
#include <boost/random/uniform_real.hpp>
using namespace std;
// This is a typedef for a random number generator.
// Try boost::mt19937 or boost::ecuyer1988 instead of boost::minstd_rand
typedef boost::minstd_rand base_generator_type;
// Define a random number generator and initialize it with a reproducible
// seed.
// (The seed is unsigned, otherwise the wrong overload may be selected
// when using mt19937 as the base_generator_type.)
base_generator_type generator(0xBB84u);
//TODO : change the seed above !
// Defines the suitable uniform ditribution.
boost::uniform_real<> uni_dist(0,1);
boost::variate_generator<base_generator_type&, boost::uniform_real<> > uni(generator, uni_dist);
void SequentialSamplesMethodA(int K, int N)
// Outputs K sorted random integers out of 0..N, taken according to
// [Vitter87], method A.
{
int top=N-K, S, curr=0, currsample=-1;
double Nreal=N, quot=1., V;
while (K>=2)
{
V=uni();
S=0;
quot=top/Nreal;
while (quot > V)
{
S++; top--; Nreal--;
quot *= top/Nreal;
}
currsample+=1+S;
cout << curr << " : " << currsample << "\n";
Nreal--; K--;curr++;
}
// special case K=1 to avoid overflow
S=floor(round(Nreal)*uni());
currsample+=1+S;
cout << curr << " : " << currsample << "\n";
}
void SequentialSamplesMethodD(int K, int N)
// Outputs K sorted random integers out of 0..N, taken according to
// [Vitter87], method D.
{
const int negalphainv=-13; //between -20 and -7 according to [Vitter87]
//optimized for an implementation in 1987 !!!
int curr=0, currsample=0;
int threshold=-negalphainv*K;
double Kreal=K, Kinv=1./Kreal, Nreal=N;
double Vprime=exp(log(uni())*Kinv);
int qu1=N+1-K; double qu1real=qu1;
double Kmin1inv, X, U, negSreal, y1, y2, top, bottom;
int S, limit;
while ((K>1)&&(threshold<N))
{
Kmin1inv=1./(Kreal-1.);
while(1)
{//Step D2: generate X and U
while(1)
{
X=Nreal*(1-Vprime);
S=floor(X);
if (S<qu1) {break;}
Vprime=exp(log(uni())*Kinv);
}
U=uni();
negSreal=-S;
//step D3: Accept ?
y1=exp(log(U*Nreal/qu1real)*Kmin1inv);
Vprime=y1*(1. - X/Nreal)*(qu1real/(negSreal+qu1real));
if (Vprime <=1.) {break;} //Accept ! Test [Vitter87](2.8) is true
//step D4 Accept ?
y2=0; top=Nreal-1.;
if (K-1 > S)
{bottom=Nreal-Kreal; limit=N-S;}
else {bottom=Nreal+negSreal-1.; limit=qu1;}
for(int t=N-1;t>=limit;t--)
{y2*=top/bottom;top--; bottom--;}
if (Nreal/(Nreal-X)>=y1*exp(log(y2)*Kmin1inv))
{//Accept !
Vprime=exp(log(uni())*Kmin1inv);
break;
}
Vprime=exp(log(uni())*Kmin1inv);
}
// Step D5: Select the (S+1)th record
currsample+=1+S;
cout << curr << " : " << currsample << "\n";
curr++;
N-=S+1; Nreal+=negSreal-1.;
K-=1; Kreal-=1; Kinv=Kmin1inv;
qu1-=S; qu1real+=negSreal;
threshold+=negalphainv;
}
if (K>1) {SequentialSamplesMethodA(K, N);}
else {
S=floor(N*Vprime);
currsample+=1+S;
cout << curr << " : " << currsample << "\n";
}
}
int main(void)
{
int Ntest=10000000, Ktest=Ntest/100;
SequentialSamplesMethodD(Ktest,Ntest);
return 0;
}
$ time ./sampling|tail
gives the following ouptut on my laptop
99990 : 9998882
99991 : 9998885
99992 : 9999021
99993 : 9999058
99994 : 9999339
99995 : 9999359
99996 : 9999411
99997 : 9999427
99998 : 9999584
99999 : 9999745
real 0m0.075s
user 0m0.060s
sys 0m0.000s