**Title**: Learning the parameters to a mixture of normal distributions

**Created**: June 3, 2019

**By**: John Urban

**For**: Anyone interested.

Originally shared as google document:

https://docs.google.com/document/d/1GPSaVFR0HRmKOdDUNTRfngpWFS0jRo6018rh1Kjgzvc/edit?usp=sharing

Summary: Learning the parameters to a mixture of normal distributions.

Oftentimes you will see two or more modes in your data.

This means your data is a mixture of normal distributions (assuming both are normal).

That means that if you compute parameters such as the mean on the pooled data, the estimate will be wrong for every distribution present.

You need to estimate the parameters for each distribution separately. Below, this document shows:

- the R library and function to use to do this,
- 4 use cases, and
- how to randomly generate data with the learned parameters to see whether it recapitulates the shape of your dataset well.

###### COPY/PASTE THE FOLLOWING INTO AN R SCRIPT IN RSTUDIO TO PLAY AROUND WITH IT

library(mixtools)

?normalmixEM

## Test 1

n <- 3000

m1 <- round(n*0.333)

m2 <- n-m1

mu1 <- 5

sd1 <- 1

mu2 <- 10

sd2 <- 2

## Test 1

n <- 10000

m1 <- round(n*0.5)

m2 <- n-m1

mu1 <- -0.8

sd1 <- 0.2

mu2 <- -0.55

sd2 <- 0.15

## Test 3

n <- 200

m1 <- round(n*0.05)

m2 <- n-m1

mu1 <- -0.99

sd1 <- 0.2

mu2 <- -0.15

sd2 <- 0.15

## Test 3

n <- 100

m1 <- round(n*0.01)

m2 <- n-m1

mu1 <- 1.0

sd1 <- 0.2

mu2 <- 1.1

sd2 <- 0.2

a <- c(rnorm(n=m1, mean=mu1, sd=sd1), rnorm(n=m2, mean=mu2, sd=sd2))

d <- density(a)

plot(d)

## for extremely biased proportions and/or small diffs in mean -- need very many iters (and maybe smaller epsilon if it will ever "converge")

## but be careful -- the given answer may be incorrect

## if you already know the mean sig of 1 dist though -- you can do pretty well

## 1. estimate both mu and sig (and proportions)

out <- normalmixEM(a, k=2, epsilon = 1e-200, maxit = 1000)

summary(out)

plot(d)

me1 <- round(n*out$lambda[1])

me2 <- n-me1

b <- c(rnorm(n=me1, mean=out$mu[1], sd=out$sigma[1]), rnorm(n=me2, mean=out$mu[2], sd=out$sigma[2]))

lines(density(b), col='red')

## 2. constrain 1 set of mu/sig that you already know, solve for unknown mu sig (and proportions)

out <- normalmixEM(a, k=2, epsilon = 1e-200, mean.constr = c(mu1,NA), sd.constr = c(sd1,NA))

summary(out)

plot(d)

me1 <- round(n*out$lambda[1])

me2 <- n-me1

b <- c(rnorm(n=me1, mean=out$mu[1], sd=out$sigma[1]), rnorm(n=me2, mean=out$mu[2], sd=out$sigma[2]))

lines(density(b), col='red')

## KNOWING even just 1 set gives a LOT of power

## 3. constrain both sets of mu/sig, solve for unknown proportions only

out <- normalmixEM(a, k=2, epsilon = 1e-200, mean.constr = c(mu1,mu2), sd.constr = c(sd1,sd2))

summary(out)

plot(d)

me1 <- round(n*out$lambda[1])

me2 <- n-me1

b <- c(rnorm(n=me1, mean=out$mu[1], sd=out$sigma[1]), rnorm(n=me2, mean=out$mu[2], sd=out$sigma[2]))

lines(density(b), col='red')

## 4. Given all parameters, compute likelihood.... need explicitly give mus, sigmas, and lambdas and set maxit to 0

## ...not just mean.constr -- but give mus as same vector as mean.constr

## mus are changing...

out <- normalmixEM(a, lambda=c(m1/n, m2/n), mu=c(mu1,mu2), sigma=c(sd1,sd2), k=2, epsilon = 1e-200, mean.constr = c(mu1,mu2), sd.constr = c(sd1,sd2), verb=TRUE, maxit = -1)

summary(out)

plot(d)

me1 <- round(n*out$lambda[1])

me2 <- n-me1

b <- c(rnorm(n=me1, mean=out$mu[1], sd=out$sigma[1]), rnorm(n=me2, mean=out$mu[2], sd=out$sigma[2]))

lines(density(b), col='red')

## No comments:

## Post a Comment