Monday, October 4, 2010

Log-logistic mixture on Google Code

I am still wrestling with TortoiseSVN on my windows machine, but the R code can now be downloaded from Google Code:

http://code.google.com/p/loglogisticmixture

No sample data or code there yet, but I will in a few days.

Thursday, September 30, 2010

llmixdist.r: a collection of functions for log-logistic distributions

This is an aux script that I source() in every time I need to work with log-logistic distributions and their mixtures for my work. I emphasize it's for my own work ONLY, because I only need a mixture of 2.5 distributions (ok, 2 roughly free component and one that is fixed except for mixing prob.). I can estimate parameters using brute force MLE, but I don't think it generalizes to n-mix models.

So this script handles the {probability, pdf, hazard, survival} functions of {shifted log-logistic, 2 shifted log-logistic mix, 2.5-shifted-log-logistic mix}.

It also contain a getDistr() function to get empirical pdf/haz functions from data.



# Gary Feng,
# Sept 2010, Potsdam, Germany
# Maxmum Likelihood Estimation of the Log-logistic Mixture model for Fixation Duration
# in Feng (in prep.)
# This is a mixture of 2 log-logistic distributions, with a shift parameter and a mixing prob. parameter
# Note: The MLE seems to be pretty sensitive to the initial values
# Testing with 50% of variation in the initial parameters and different sameple sizes.
# Also, the LL for the "true" model may not be the maximum compared to other possible models,
# due to random errors. I often find the estimated para are actually better than the "true" para.
# An issue of sample size?
# to-do:
# ---1). Tieing the location parameters
# ---2). adding the express saccade prob mass estimation
# 3). use LeastSquare or Chi-Square methods to estimate the init parameters
# ---4). detect fit failure and automatically re-estimate using different init-para
# ---5). some ways to optimize globally, using other optim algorithms? Grid search?
# -X-6). bayesian estimation?? <-- way to slow and not practical for large datasets.
# ---7). Add AIC/BIC

###############################
# get pdf & haz
###############################
library(muhaz);
getDistr <-function (fixdur, binwidth=2, maxtime=1000, mintime=50) {
fixdur<-round(fixdur/binwidth)*binwidth
status<-rep(1,length(fixdur))
out1<-kphaz.fit(fixdur,status)

# get the density fun
fixhist <-hist(fixdur, c(0, out1$time, max(fixdur)), plot=F)

# the lengths don't match.
n<-length(fixhist$density);
#out1$time2 <-fixhist$breaks[2:n]
out1$pdf <-fixhist$density[2:n]
out1$counts <-fixhist$counts[2:n]

time2<-seq(0, max(fixdur), binwidth);
fixhist2 <-hist(fixdur, breaks=time2, plot=F)
out1$time2<-fixhist2$breaks[2:length(fixhist2$breaks)-1];
out1$pdf2<-fixhist2$density;
out1$counts2 <-fixhist2$counts

return (out1)
}

###############################
# global vars
###############################
lx.default<-80; sx.default<-4; # for the Px component

###############################
# Log-logistic functions
###############################
# functions for Log-logistic density, cumulative prob., and random number generation
# random log-logistic variable generator, by reversing the CDF
rllogis <-function (n, location = 0, scale = 1)
{
    u <-runif(n)
location * (u/(1-u))^(1/scale)
}
# (cumulative) prob. function
pllogis <- function (q, location, scale, d, lower.tail = TRUE, log.p = FALSE)
{
    #if (q<0) return (0)
#if (q<d) return (0)
p <- 1/(1+((q-d)/location)^(-scale))
# make sure it's valid prob.
p[q<0] <-0;
p[q<d] <-0;
if (!lower.tail) p <- 1-p;
if (log.p) p <-log(p);
return (p)
}

# density of log-logistic
dllogis <-function (x, location, scale, d) {
    if (any(x <= 0))
        stop("x elements must be strictly positive")
    x[x<=d] <- d+.Machine$double.xmin
# .Machine.double.xmin
a=scale;
b=1/location;
p<-a * b *(b * (x-d))^(a-1) * (1+(b * (x-d))^a)^-2;
return (p);
}

# density for 2-log-logistic-mixture
# had to make sure it doesn't return 0
dll2 <- function (x, location1, scale1, location2, scale2, mixp, d)
{
    if (any(x <= 0))
        stop("x elements must be strictly positive")
p<- mixp * dllogis(x, location1, scale1, 0) + (1-mixp) * dllogis(x, location2, scale2, d)
#mixp * dlogis(log(x), location1, scale1)/x
#+ (1-mixp) * dlogis(log(x), location2, scale2)/(x)

# .Machine$double.xmin
p[p<=0]<-.Machine$double.xmin
p[p>1] <-.Machine$double.xmin
return(p)

}

# density for 2-log-logistic-mixture + eXpress saccade
# Px is free, location=lx=a=80, shape=sx=b=6 by default; SD(80/6)=26.84; SD(80/8)=19.22;
# px_mean =a * pi/b/sin(pi/b)
# px_sd = sqrt(a^2*pi/b*(2/sin(2*pi/b) - pi/b*(sin(pi/b))^-2))
dll2x <- function (x, location1, scale1, location2, scale2, mixp, d, px, lx=lx.default, sx=sx.default)
{
    if (any(x <= 0))
        stop("x elements must be strictly positive")
# p<- mixp * dllogis(x, location1, scale1, 0) + (1-mixp) * dllogis(x, location2, scale2, d)
p<- px * dllogis(x, lx, sx, 0) + (1-px)*(mixp * dllogis(x, location1, scale1, 0) + (1-mixp) * dllogis(x, location2, scale2, d));
#mixp * dlogis(log(x), location1, scale1)/x
#+ (1-mixp) * dlogis(log(x), location2, scale2)/(x)

# .Machine$double.xmin
p[p<=0]<-.Machine$double.xmin
p[p>1] <-.Machine$double.xmin
return(p)

}

# (cumulative) probability for 2-log-logistic-mixture
pll2 <- function (x, location1, scale1, location2, scale2, mixp, d)
{
    if (any(x < 0))
        stop("x elements must be strictly positive")
p<- mixp * pllogis(x, location1, scale1, 0) + (1-mixp) * pllogis(x, location2, scale2, d)
return(p)

}
# probability for 2-log-logistic-mixture + eXpress saccade
# Px is free, location=lx=a=80, shape=sx=b=6 by default; SD(80/6)=26.84; SD(80/8)=19.22;
# px_mean =a * pi/b/sin(pi/b)
# px_sd = sqrt(a^2*pi/b*(2/sin(2*pi/b) - pi/b*(sin(pi/b))^-2))
pll2x <- function (x, location1, scale1, location2, scale2, mixp, d, px, lx=lx.default, sx=sx.default)
{
    if (any(x <= 0))
        stop("x elements must be strictly positive")
# p<- mixp * dllogis(x, location1, scale1, 0) + (1-mixp) * dllogis(x, location2, scale2, d)
p<- px * pllogis(x, lx, sx, 0) + (1-px)*(mixp * pllogis(x, location1, scale1, 0) + (1-mixp) * pllogis(x, location2, scale2, d));
#mixp * dlogis(log(x), location1, scale1)/x
#+ (1-mixp) * dlogis(log(x), location2, scale2)/(x)

# .Machine$double.xmin
p[p<=0]<-.Machine$double.xmin
p[p>1] <-.Machine$double.xmin
return(p)

}

# survival function for 2-log-logistic-mixture
sll2 <- function (x, location1, scale1, location2, scale2, mixp, d)
{
return(1-pll2(x, location1, scale1, location2, scale2, mixp, d))
}
# survival function for 2-log-logistic-mixture with eXpress saccade component
# use default lx, sx parameters
sll2x <- function (x, location1, scale1, location2, scale2, mixp, d, px)
{
return(1-pll2x(x, location1, scale1, location2, scale2, mixp, d, px))
}

# hazard rate for 2-log-logistic-mixture
hll2 <- function (x, location1, scale1, location2, scale2, mixp, d)
{
h<-dll2(x, location1, scale1, location2, scale2, mixp, d)/sll2(x, location1, scale1, location2, scale2, mixp, d)
return(h)
}
# hazard rate for 2-log-logistic-mixture, with eXpress saccade
hll2x <- function (x, location1, scale1, location2, scale2, mixp, d, px)
{
h<-dll2x(x, location1, scale1, location2, scale2, mixp, d, px)/sll2x(x, location1, scale1, location2, scale2, mixp, d, px)
return(h)
}


Eval, Parse, and a little magic in R

Magic in the sense it really saved me loads of troubles.

So following my last post, where I could parse a model specification into a list of useful things. Now let's use the parse in a MLE function.

So parsepara() returns a list, for example:

$input
[1] "145, 8, 145, ?6/7/3, ?0.7|0.3|0.9, ?100|150|60, ?0.05|0.15|0.01"

$paralist
     145        8      145                                  
   "145"      "8"    "145" "par[1]" "par[2]" "par[3]" "par[4]"

$initpara
    6   0.7   100  0.05
6e+00 7e-01 1e+02 5e-02

$upper
[1]   7.00   0.90 150.00   0.15

$lower
[1]  3.00  0.30 60.00  0.01


$paralist is the most important part. Note it's a list of strings, even for numerals. You can use this to create a log-likelihood function where it can take any combinations of fix/free/tied parameters. I also used it for drawing the model -- I needed to use the estimated parameters to generate a theoretical line. To do so I also needed both free and fixed parameters. Fortunately I can just replace "par" with anything that gives me the MLE parameters. See the code below, look for "gsub(...) function.

$initpara is the "real" parameter list, one that will be used by optim(). It contains ONLY free parameters. Look at the code to see how tied parameters are handled. But the length of $initpara is the # of parameters of the model.

$upper/$lower are the bounds. Doesn't matter the order you specify them; the larger will be upper, and smaller be lower. And if you don't specify, you get Inf/-Inf.

I highlighted places where some tricks are used.


fitll2_2 <- function(data, strpara, maxtime=1000, ylim=0.010) {
if (missing(data)) {
cat("Data should be a vector of SRT/fixation duration\n");
return()
} else if (class(data)!=  "numeric" & class(data)!=  "list") {
cat("Data should be a vector or a list of vectors of SRT/fixation duration\n");
return()
} else if (missing(strpara)) {
cat("Please supply the correct parameters; see 'parsepara.r' for instructions\n");
return()
}
if (class(data) == "list") {datalist<-data;}
else {datalist<-list(data=data);}
# now parse the LL parameters
para<-parsepara(strpara); #parse the parameter, return a list of named lists, see parsepara.r
if (is.null(para)) {
cat("Please supply the correct parameters; see 'parsepara.r' for instructions\n");
return()
}
strparalist<-paste(para$paralist, collapse=", "); cat("para=",strparalist,"\n"); llfun<-eval(parse(text = paste("function(par) {-sum(log(dll2x(data, ", strparalist, ")));}")));
initpara<-para$initpara; upperpara<-para$upper; lowerpara<-para$lower;
# set up things
fit <-list(); stats <-list();colors <-c('red', 'blue','black', 'green', 'brown', 'grey');
# now do the estimates

 counter<-1;
for (i in names(datalist)) {
data<-datalist[[i]]; data<-data[data>0]; data<-data[is.finite(data)];
mleFit<- optim(fn = eval(parse(text = paste("function(par) {-sum(log(dll2x(data, ", strparalist, ")));}"))), par = initpara, method = "L-BFGS-B", upper=upperpara, lower=lowerpara, hessian = T)
mleFit$initpara<-initpara;
mleFit$npara <-length(initpara); # only non identical upper/lower limites
mleFit$n <- length(data);
mleFit$BIC <- log1p(mleFit$n)*mleFit$npara + 2*mleFit$value; #using log1p to prevent n=0; shouldn't be an issue
mleFit$AIC <- 2*mleFit$npara + 2*mleFit$value;
fit[[i]]  <-mleFit;
stat[[i]] <-getDistr(data, binwidth=20);

counter<-counter+1;
}

if (T) {
# set up the pdf
plot(0,0, type='n', xlim=c(0,min(maxtime, 1000)), ylim=c(0,ylim),frame.plot=FALSE, xlab="Fixation Duration", ylab="Probability Density");
c<-1; time2<-seq(10, min(maxtime, 1000), by=1);  counter<-1;
for (i in names(fit)) {
lines(stat[[i]]$time, stat[[i]]$pdf, col=colors[c], lty='dashed');
#lines(time2-10, dll2x(time2, a, b1, a, b2, mixp, d, px), col=colors[c]); # shared L parameter
lines(time2-10, eval(parse(text = paste("dll2x(time2, ", gsub("par", "fit[[i]]$par", strparalist), ")"))), col=colors[c]); # paralist replaced with fit[[i]]$par
text(min(maxtime, 1000)*0.3, (ylim-c*0.0015), paste(c("parameters=", unlist(para$paralist)), collapse=", "), col=colors[c], pos=4, cex=0.8);
text(min(maxtime, 1000)*0.3, (ylim-c*0.0015-0.0005), paste(c(i, unlist(round(fit[[i]]$par, 2))), collapse=", "), col=colors[c], pos=4, cex=0.8);
text(min(maxtime, 1000)*0.3, (ylim-c*0.0015-0.0010), paste(c('n=', 'LL=', 'AIC=', 'BIC='), c(unlist(round(c(fit[[i]]$n, fit[[i]]$value, fit[[i]]$AIC, fit[[i]]$BIC), 2))), collapse=", "), col=colors[c], pos=4, cex=0.8);
c=c+1; counter<-counter+1;
}
c<-1; counter<-1;
#set up the haz; missing the eXpress saccade component
plot(0,0, type='n', xlim=c(0,min(maxtime, 1000)), ylim=c(0,ylim*2), frame.plot=FALSE, xlab="Fixation Duration", ylab="Hazard Rate");
for (i in names(fit)) {
lines(stat[[i]]$time, stat[[i]]$haz, col=colors[c], lty='dashed');
a<-fit[[i]]$par[1]; b1<-fit[[i]]$par[2]; b2<-fit[[i]]$par[3];
mixp<-fit[[i]]$par[4]; d<-fit[[i]]$par[5]; px<-fit[[i]]$par[6];
#lines(time2-10, hll2x(time2, a, b1, a, b2, mixp, d, px), col=colors[c]); # No Px parameter
lines(time2-10, eval(parse(text = paste("hll2x(time2, ", gsub("par", "fit[[i]]$par", strparalist), ")"))), col=colors[c]); # paralist replaced with fit[[i]]$par
c=c+1; counter<-counter+1;
}

}

return (mleFit)
}

ParsePara: a little R parser to speed up MLE specifications

For my MLE of Log-logistic mixtures, I have to specify parameters, initial values, upper and lower bounds, etc. And to fix or tie parameters I had to rewrite the log-likelihood function, etc. A whole mess.

So here comes a little script to parse a special way to specify fixed, free, and tied parameters. Then I combine these with eval(parse( text="....")) command in R to run all combinations of models with a single function.

Here goes the code.



##############
# Parse parameters for the MLE estimation routine
# Gary Feng, copyleft, Sept 2010, Potsdam, Germany
##############
# Format of the input parameter specification:
#    e.g., str<-"2, ?0.05,3, ?200//400||100, =1"
#  Any numeric values --> fixed parameters, will not generate parameter list
#  ?## --> free parameter, ## is the initial value, upper and lower limites set to Inf, -Inf
#  ?##1/##2/##3 --> bounded parameter, ##1 is the init value, ##2 and ##3 will be the upper/lower bounds, auto re-ordered
#  =# --> tied parameter, pointing to parameter number #. 
#  @counter*2+180 --> a formula: CAUTION, make sure you know what you are doing.
# the string after @ will be treated as a formula and will be evaluated at the run time, 
# make sure it is a valid expression. So far the only "variable" it can use is "counter", which is an internal
# variable in fitll2() that keeps track of which subset (list) of the data is currently being fitted (starting 1)
##############
# Returns
#  a list of:
#   -- input: original string
#   -- paralist: list of parameters, with free/tied parameters replaced with "para[#]"; 
# Note the # of items is the same as in the input; i.e., fixed parameters are in here, in the same order. 
#   -- initpara/upperpara/lowerpara: bounds and init paras, with # equal to the # of free parameters. 
##############


parsepara <-function (str) {
res<-sapply(strsplit(str, "[ ]*,[ ]*")[[1]], as.numeric); #names(res); res
initpara<-c(); upperpara<-c(); lowerpara<-c(); paralist<-c(); c<-1;
for (n in names(res)) {
#cat(n, "\n");
if(!is.na(res[n])) {
paralist<-c(paralist, res[n]);#cat("paralist= ",paralist, "\n");
} else if(substring(n, 1,1)=='?') {
dump<- substring(n, 2); # get the rest of the strings
#cat("dump= ",dump, "\n");
args<- sapply(strsplit(dump, "[/|\\]+")[[1]], as.numeric)
#cat("args= ",args, "\n");
if(any(is.na(args))) cat("Error! Some parameters are not numeric: ", args)
if(length(args)==3) {
initpara<-c(initpara, args[1]); #cat("initpara= ",initpara, "\n");
upperpara<-c(upperpara,max(args[2],args[3])); #cat("upperpara= ",upperpara, "\n");;
lowerpara<-c(lowerpara,min(args[2],args[3])); #cat("lowerpara= ",lowerpara, "\n");;
} else if(length(args)==1) {
initpara<-c(initpara, args[1]); #cat("initpara= ",initpara, "\n");
upperpara<-c(upperpara,Inf); #cat("upperpara= ",upperpara, "\n");;
lowerpara<-c(lowerpara,-Inf); #cat("lowerpara= ",lowerpara, "\n");;
} else {
cat("Error! Format= '?initpara/upper/lower': ", args);
return()
}
paralist<-c(paralist, paste("par[",c,"]", sep='')); #cat("paralist= ",paralist, "\n");
c<-c+1;
} else if(substring(n, 1,1)=='=') {
dump<- substring(n, 2); # get the rest of the strings
args<- as.numeric(dump);
if (is.na(args))  {cat("Error! format= 'tieto3'. ", args); return();}
#if (!is.na(args)) {cat("parameter tied to ", args, "\n");}
paralist<-c(paralist, paste("par[",args,"]", sep=''));#cat("paralist= ",paralist, "\n");
} else if(substring(n, 1,1)=='@') { # formula
dump<- substring(n, 2); # get the rest of the strings
# if it's a formula, we do nothing now, just stick it into theparalist
paralist<-c(paralist, dump);#cat("paralist= ",paralist, "\n");
} else {
# the above should handle everything; we got here by error
cat("Error! Unrecognized input\n");
return();
}
}
out<-list(input=str, paralist=paralist, initpara=initpara, upper=upperpara, lower=lowerpara);
return(out)
}


# test
# str<-"@206-counter*3, 7.5, @206-counter*3, ?4/6/3, 0.9, 95, ?0.05|0.2|0.01"
# parsepara(str)

Tuesday, September 28, 2010

log-logistic mixture project started

Possibly world's most boring picture, but nonetheless a start.

Today I posted the R source code for Log-logistic Mixture model on Google Code

http://code.google.com/p/loglogisticmixture/

This is one of the graphs it generates -- a simulated dataset of 500 data points from 3 loglogistic distributions. Estimated using the Maximum Likelihood method. Not too bad.


Let's begin.