Modern Statistical Methods for Astronomy With R Applications Eric D. Feigelson & G. Jogesh Babu R scripts These scripts may be cut and pasted into any R console. They are self-contained and should reproduce the figures and results in the book. ****************************************************************** ****************************************************************** Chapter. 2 Probability # Set up 6 panel figure par(mfrow=c(3,2)) # Plot upper left panel with three illustrative exponential p.d.f. distributions xdens <- seq(0,5,0.02) plot(xdens,dexp(xdens,rate=0.5), type='l', ylim=c(0,1.5), xlab='', ylab='Exponential p.d.f.',lty=1) lines(xdens,dexp(xdens,rate=1), type='l', lty=2) lines(xdens,dexp(xdens,rate=1.5), type='l', lty=3) legend(2, 1.45, lty=1, substitute(lambda==0.5), box.lty=0) legend(2, 1.30, lty=2, substitute(lambda==1.0), box.lty=0) legend(2, 1.15, lty=3, substitute(lambda==1.5), box.lty=0) # Help files to learn these function help(seq) ; help(plot); help(par) ; help(lines); help(legend) # Plot upper right panel with three illustrative exponential c.d.f. distributions plot(xdens, pexp(xdens,rate=0.5), type='l', ylim=c(0,1.0), xlab='', ylab='Exponential c.d.f.', lty=1) lines(xdens, pexp(xdens,rate=1), type='l', lty=2) lines(xdens, pexp(xdens,rate=1.5),type='l',lty=3) legend(3, 0.50, lty=1, substitute(lambda==0.5), box.lty=0) legend(3, 0.38, lty=2, substitute(lambda==1.0), box.lty=0) legend(3, 0.26, lty=3, substitute(lambda==1.5), box.lty=0) # Plot middle panels with illustrative normal p.d.f. and c.d.f. xdens <- seq(-5, 5, 0.02) ylabdnorm <- expression(phi[mu~sigma^2] (x)) plot(xdens, dnorm(xdens, sd=sqrt(0.2)), type='l', ylim=c(0,1.0), xlab='', ylab=ylabdnorm,lty=1) lines(xdens,dnorm(xdens, sd=sqrt(1.0)), type='l', lty=2) lines(xdens,dnorm(xdens, sd=sqrt(5.0)), type='l', lty=3) lines(xdens,dnorm(xdens, mean=-2.0, sd=sqrt(0.5)), type='l', lty=4) leg1 <- expression(mu^' '==0, mu^' '==0, mu^' '==0, mu^' '==-2) leg2 <- expression(sigma^2==0.2, sigma^2==1.0, sigma^2==5.0, sigma^2==0.5,) legend(0.5, 1.0, lty=1:4, leg1, lwd=2, box.lty=0) legend(3.0, 1.01, leg2, box.lty=0) ylabpnorm <- expression(Phi[mu~sigma^2] (x)) plot(xdens,pnorm(xdens,sd=sqrt(0.2)), type='l', ylim=c(0,1.0), xlab='', ylab=ylabpnorm,lty=1) lines(xdens,pnorm(xdens, sd=sqrt(1.0)), type='l', lty=2) lines(xdens,pnorm(xdens, sd=sqrt(5.0)), type='l', lty=3) lines(xdens,pnorm(xdens, mean=-2.0, sd=sqrt(0.5)), type='l', lty=4) leg1 <- expression(mu^' '==0, mu^' '==0, mu^' '==0, mu^' '==-2) leg2 <- expression(sigma^2==0.2, sigma^2==1.0, sigma^2==5.0, sigma^2==0.5,) legend(0.5, 0.6, lty=1:4, leg1, lwd=2, box.lty=0) legend(3.0, 0.61, leg2, box.lty=0) # Plot bottom panels with illustrative lognormal p.d.f. and c.d.f. xdens <- seq(0,3, 0.02) plot(xdens, dlnorm(xdens, meanlog=0, sdlog=5), type='l', ylim=c(0,2), xlab='', ylab='Lognormal density', lty=1) lines(xdens, dlnorm(xdens, meanlog=0, sdlog=1), type='l', lty=2) lines(xdens, dlnorm(xdens, meanlog=0, sdlog=1/2), type='l', lty=3) lines(xdens, dlnorm(xdens, meanlog=0, sdlog=1/8), type='l', lty=4) leg1 <- expression(sigma==5, sigma==1, sigma==1/2, sigma==1/8) legend(1.8,1.8,lty=1:4,leg1,box.lty=0) plot(xdens,plnorm(xdens,meanlog=0,sdlog=5),type='l',ylim=c(0,1),xlab='x', ylab='Lognormal distribution',lty=1) lines(xdens,plnorm(xdens,meanlog=0,sdlog=1),type='l',lty=2) lines(xdens,plnorm(xdens,meanlog=0,sdlog=1/2),type='l',lty=3) lines(xdens,plnorm(xdens,meanlog=0,sdlog=1/8),type='l',lty=4) leg1 <- expression(sigma==5,sigma==1,sigma==1/2,sigma==1/8) legend(1.5,0.6,lty=1:4,leg1,box.lty=0) # Return plot to single-panel format par(mfrow=c(1,1)) ****************************************************************** ****************************************************************** Chapter 4 Probability Distribution Functions # Define Pareto distributions for Salpeter IMF dpareto <- function(x, alpha=1.35, b=1) (alpha>0)*(b>0)^alpha / x^(alpha+1) ppareto <- function(x, alpha=1.35, b=1) (x > b)*(1-((b>0)/x)^(alpha>0)) qpareto <- function(u, alpha=1.35, b=1) (b>0)/(1-u)^(1/(alpha>0)) # 00]) lmidpts <- log(hx$mids[counts>0]) tmp1 <- lm(ldens~lmidpts) # plot(lmidpts, ldens) ; abline(tmp1, col=2) alpha.LS.hist <- -1 - as.numeric(tmp1$coef[2]) return(alpha.LS.hist) } alpha.LS.hist.wt <- function(x) { hx <- hist(x, nclass=20, plot=F) counts <- hx$counts ldens <- log(hx$density[counts>0]) lmidpts <- log(hx$mids[counts>0]) tmp2 <- lm(ldens~lmidpts, weights=counts[counts>0]) alpha.LS.hist.wt <- -1 - as.numeric(tmp2$coef[2]) return(alpha.LS.hist.wt) } six_alphas <- function(x) { out <- c(alpha.MLE(x), alpha.MVUE(x), alpha.mom(x), alpha.LS.EDF(x), alpha.LS.hist(x), alpha.LS.hist.wt(x)) return(out) } # Comparison of alpha estimators on simulated datasets # Construct nsim simulated datasets with npts data points and alpha=1.35 nsim=500 ; alpha_sim = NULL for(i in 1:nsim) { xtmp = rpareto(npts) alpha_sim = rbind(alpha_sim,six_alphas(xtmp)) } colnames(alpha_sim)=c('MLE','MVUE','Moments','LS_EDF','LS_hist', 'LS_hist_wt') # Compute mean integrated square error bias_sim = apply(alpha_sim,2,mean) - 1.35 var_sim = apply(alpha_sim,2,var) *(nsim-1)/nsim mise_sim = bias_sim^2 + var_sim rbind(bias_sim, var_sim, mise_sim) # Read Milky Way Galaxy and M 31 globular cluster K magnitudes GC_MWG <- read.table('http://astrostatistics.psu.edu/MSMA/datasets/ GlobClus_MWG.dat',header=T) GC_M31 <- read.table('http://astrostatistics.psu.edu/MSMA/datasets/ GlobClus_M31.dat',header=T) KGC_MWG <- GC_MWG[,2] ; KGC_M31 <- GC_M31[,2]-24.44 kseq <- seq(-15.0, -5.0, 0.25) # Fit normal distributions library(MASS) par(mfrow=c(1,2)) hist(KGC_MWG, breaks=kseq, ylim=c(0,10), main='', xlab='K mag', ylab='N',col=gray(0.5)) normfit_MWG <- fitdistr(KGC_MWG,'normal') ; normfit_MWG lines(kseq, dnorm(kseq, mean=normfit_MWG$estimate[[1]], sd=normfit_MWG$estimate[[2]]) * normfit_MWG$n*0.25,lwd=2) hist(KGC_M31, breaks=kseq, ylim=c(0,50), main='',xlab='K mag',ylab='N',) normfit_M31 <- fitdistr(KGC_M31,'normal') ; normfit_M31 lines(kseq,dnorm(kseq, mean=normfit_M31$estimate[[1]], sd=normfit_M31$estimate[[2]]) * normfit_M31$n*0.25, lwd=2) # Test for normality install.packages('nortest') ; library(nortest) install.packages('moments') ; library(moments) lillie.test(KGC_MWG) ; cvm.test(KGC_MWG); ad.test(KGC_MWG) lillie.test(KGC_M31) ; cvm.test(KGC_M31) ; ad.test(KGC_M31) pearson.test(KGC_M31) skewness(KGC_M31) ; agostino.test(KGC_M31) ; jarque.test(KGC_M31) kurtosis(KGC_M31) ; anscombe.test(KGC_M31) ; bonett.test(KGC_M31) shapiro.test(KGC_M31) ; sf.test (KGC_M31) # Test for multimodality library(diptest) dip(KGC_M31) ****************************************************************** ****************************************************************** Chapter 5. Nonparametric Statistics # Data plots and summaries for asteroid densities asteroids <- read.table("http://astrostatistics.psu.edu/MSMA/datasets/ asteroid.dens.dat", header=T) astnames <- asteroids[,1] ; dens <- asteroids[,2] ; err <- asteroids[,3] dotchart(dens, labels=astnames, cex=0.9, xlab=expression(Density~(g/cm^3))) plot(dens, ylim=c(0,8), xlab="Asteroids", ylab=expression(Density~(g/cm^3)),pch=20) num <- seq(1,length(dens)) segments(num, dens+err, num, dens-err) # Boxplot and summary statistics for asteroid data boxplot(asteroids[,2:3], varwidth=T, notch=T, xlab="Asteroids", ylab="Density", pars=list(boxwex=0.3,boxlwd=1.5,whisklwd=1.5,staplelwd=1.5,outlwd=1.5, font=2)) summary(dens) mean(dens) ; sd(dens) median(dens) ; mad(dens) weighted.mean(dens,1/err^2) # Tests for normality shapiro.test(dens) install.packages('nortest') ; library('nortest') ad.test(dens) ; cvm.test(dens) ; lillie.test(dens) ; pearson.test(dens) install.packages('outliers') ; library('outliers') dixon.test(dens) ; chisq.out.test(dens) grubbs.test(dens) ; grubbs.test(dens,type=20) # Read magnitudes for Milky Way and M 31 globular clusters GC1 <- read.table("http://astrostatistics.psu.edu/MSMA//datasets/ GlobClus_MWG.dat",header=T) GC2 <- read.table("http://astrostatistics.psu.edu/MSMA/datasets/ GlobClus_M31.dat",header=T) K1 <- GC1[,2] ; K2 <- GC2[,2] summary(K1) ; summary(K2) # Three estimates of the distance modulus to M 31 DMmn <- mean(K2) - mean(K1) ; DMmn sigDMmn <- sqrt(var(K1)/length(K1) + var(K2)/length(K2)) ; sigDMmn DMmed <- median(K2) - median(K1) ; DMmed sigDMmed <- sqrt(mad(K1)^2/length(K1) + mad(K2)^2/length(K2)) ; sigDMmed wilcox.test(K2, K1, conf.int=T) # e.d.f., quantile and Q-Q plots for globular cluster magnitudes plot(ecdf(K1), cex.points=0, verticals=T, xlab="K (mag)", ylab="e.d.f.", main="") plot(ecdf(K2-24.90), cex.points=0, verticals=T, add=T) text(-7.5, 0.8, lab="MWG") ; text(-10.5,0.9, lab="M 31") par(mfrow=c(1,3)) qqplot(K1, K2-24.90, pch=20, cex.axis=1.3, cex.lab=1.3, xlab="MWG", ylab="M31 - 24.90", main="") qqnorm(K1, pch=20, cex.axis=1.3, cex.lab=1.3, main="") qqline(K1, lty=2, lwd=1.5) text(-2.5, -6, pos=4, cex=1.3, 'MWG normal QQ plot') qqnorm(K2-24.90, pch=20, cex.axis=1.3, cex.lab=1.3, main="") qqline(K2-24.90, lty=2, lwd=1.5) text(-3, -7.5, pos=4, cex=1.3, 'M31 normal QQ plot') par(mfrow=c(1,1)) # Plot e.d.f. with confidence bands install.packages('sfsmisc') ; library('sfsmisc') ecdf.ksCI(K1,ci.col='black') # Nonparametric tests for normality install.packages('nortest') ; library(nortest) cvm.test(K1) ; cvm.test(K2) ad.test(K1) ; ad.test(K2) # Nonparametric two-sample tests for Milky Way and M31 globular clusters ks.test(K1, K2-24.90) # with distance modulus offset removed wilcox.test(K1, K2-24.90) mood.test(K1, K2-24.90) install.packages(cramer) ; library(cramer) cramer.test(K1, K2-24.90) # Parametric two-sample tests pooled_var <- ((length(K1)-1)*var(K1) + (length(K2)-1)*var(K2)) / (length(K1)+length(K2)-2) mean_diff <- (mean(K1) - mean(K2-24.90)) / sqrt(pooled_var * (1/length(K1)+1/length(K2))) mean_diff ; pnorm(mean_diff) t.test(K1, (K2-24.90), var.eq=T) var.test(K1, K2-24.90) # Input star formation contingency tables jets <- matrix(c(9,2,5,5), nrow=2) ; jets evol <- matrix(c(21,1,2,0,16,14,42,1,61,90,179,2,17,4,5,5,22,95,126,10),nrow=4) evol # Test null hypotheses chisq.test(jets) ; fisher.test(jets) chisq.test(evol) fisher.test(evol[2:3,]) # Chamaeleon vs. Taurus sample fisher.test(evol[3:4,]) # Taurus vs. eta Cha sample # Association plot assocplot(evol,col=c("black","gray"),xlab='Regions',ylab='Evolutionary classes') ****************************************************************** ****************************************************************** Chapter 6. Density Estimation or Data Smoothing # Construct large and small samples of SDSS quasar redshifts and r-i colors qso <- read.table("http://astrostatistics.psu.edu/MSMA/datasets/SDSS_QSO.dat",head=T) dim(qso) ; names(qso) ; summary(qso) ; attach(qso) z.all <- z ; z.200 <- z[1:200] r.i.all <- r_mag - i_mag ; r.i.200 <- r.i.all[1:200] sig.r.i.all <- sqrt(sig_r_mag^2 + sig_i_mag^2) ; sig.r.i.200 <- sig.r.i.all[1:200] # Plot histogram and quantile function par(mfrow=c(1,2)) hist(z.all, breaks='scott', main='', xlab='Redshift', col='black') plot(quantile(z.all, seq(1,100,1)/100, na.rm=T), pch=20, cex=0.5, xlab='Percentile', ylab='Redshift') par(mfrow=c(1,1)) # Constant kernel density estimator plot(density(z.all), bw=bw.nrd(z.all), main='', xlab='Redshift', lwd=2) # Adaptive kernel smoother install.packages("quantreg") ; library(quantreg) akern.zqso <- akj(z.all, z=seq(0,5,0.01), alpha=0.5) str(akern.zqso) plot.window(xlim=c(0,5), ylim=c(0,0.6)) plot(seq(0,5, 0.01), akern.zqso$dens, pch=20, cex=0.5, xlab="Redshift", ylab="Density") rug(sample(z.all, 500)) # Distribution with and without measurement errors aster <- read.table('http://astrostatistics.psu.edu/MSMA/datasets/ asteroid_dens.dat', head=T) summary(aster) ; dim(aster) ; attach(aster) install.packages('decon') ; library(decon) par(mfrow=c(1,2)) plot(ecdf(Dens), main='', xlab=expression(Asteroid~density~~g/cm^3), ylab='c.d.f.', verticals=T, cex=0, lwd=1.5) x <- seq(0, 6, 0.02) cdf_sm <- DeconCdf(Dens,Err,x,bw=0.1) lines(cdf_sm, lwd=1.5) plot(DeconPdf(Dens,Err,x,bw=0.1), main='', xlab=expression(Asteroid~density~~ g/cm^3), ylab='p.d.f.', lwd=1.5, add=T) lines(density(Dens, adjust=1/2), lwd=1.5, lty=2) par(mfrow=c(1,1)) # Natural spline fit install.packages("pspline") ; library(pspline) fit=sm.spline(z.200, r.i.200) plot(z.200, r.i.200, pch=20, cex=0.7, xlab="Redshift", ylab="r-i (mag)") lines(fit$x, fit$y, lwd=2) # Natural spline fit weighted by the variances of the measurement errors fitw <- sm.spline(z.200, r.i.200, w=sig.r.i.200^2) lines(fitw$x, fitw$y, lty=2, lwd=2) # Bivariate Nadaraya-Watson regression estimator with bootstrap errors install.packages("np") ; library(np) bw.NW <- npregbw(z.200, r.i.200, regtype='lc', bwtype='fixed') npplot(bws=bw.NW, ylim=c(-0.25,0.7), plot.errors.method="bootstrap", plot.errors.bar='I', plot.errors.type='quantiles') points(z.200, r.i.200, pch=20, cex=0.7) # Local regression # 1. Read SDSS quasar sample, N=77,429. Clean bad photometry qso <- read.table("http://astrostatistics.psu.edu/datasets/SDSS_QSO.dat",head=T) q1 <- qso[qso[,10] < 0.3,] ; q1 <- q1[q1[,12]<0.3,] dim(q1) ; names(q1) ; summary(q1) r_i <- q1[,9] - q1[,11] ; z <- q1[,4] ; r <- q1[,9] # 2. Plot two-dimensional smoothed distribution install.packages('ash') ; library(ash) nbin <- c(500, 500) ; ab <- matrix(c(0.0,-0.5,5.5,2.), 2,2) bins <- bin2(cbind(z,r_i), ab, nbin) f <- ash2(bins, c(5,5)) ; attributes(f) f$z <- log10(f$z) image(f$x, f$y, f$z, col=gray(seq(0.5,0.2,by=-0.02)), zlim=c(-2.0,0.2), main='', xlab="Redshift", ylab="r-i") contour(f$x, f$y, f$z,levels=c(-1.0,-0.5,0.3,0.5), add=T) # 3. Construct loess local regression lines z1 <- z[order(z)] ; r_i1 <- r_i[order(z)] loct1 <- loess(r_i1~z1, span=0.1, data.frame(x=z1,y=r_i1)) summary(loct1) lines(z1, predict(loct1), lwd=2, col=2) z2 <- z1[z1>3.5]; r_i2 <- r_i1[z1>3.5] loct2 <- loess(r_i2~z2, span=0.2, data.frame(x=z2,y=r_i2)) lines(z2, predict(loct2), lwd=2, col=3) # 4. Save evenly-spaced loess fit to a file x1 <- seq(0.0, 2.5, by=0.02) ; x2 <- seq(2.52, 5.0, by=0.02) loctdat1 <- predict(loct1, data.frame(x=x1)) loctdat2=predict(loct2, data.frame(x=x2)) write(rbind(x1,loctdat1), sep=' ', ncol=2, file='qso.txt') write(rbind(x2,loctdat2), sep=' ', ncol=2, file='qso.txt', append=T) ****************************************************************** ****************************************************************** Chapter 7. Regression # Linear regression with heteroscedastic measurement errors # Construct and plot dataset of SDSS quasars with 1818)),c(11:14)] dim(qso1) ; summary(qso1) ; attach(qso1) sig_z_mag[sig_z_mag<0.01] <- 0.01 dev.new(2) plot(i_mag, z_mag, pch=20, cex=0.1, col=grey(0.5), xlim=c(18,21.5), ylim=c(17.5,22), xlab="SDSS i (mag)", ylab="SDSS z (mag)") for(i in 50:150) { lines(c(i_mag[i],i_mag[i]),c((z_mag[i]+sig_z_mag[i]), (z_mag[i]-sig_z_mag[i]))) lines(c((i_mag[i]+sig_i_mag[i]),(i_mag[i]-sig_i_mag[i])), c(z_mag[i],z_mag[i])) } # Ordinary least squares fit fit_ols <- lm(z_mag~i_mag) summary(fit_ols) confint(fit_ols, level=0.997) dev.set(2) ; abline(fit_ols$coef, lty=1 ,lwd=2) dev.new(3) ; par(mfrow=c(2,2)) plot(fit_ols, which=c(2:5), caption='', sub.caption='' ,pch=20, cex=0.3, cex.lab=1.3, cex.axis=1.3) par(mfrow=c(1,1)) # Weighted least squares fit fit_wt <- lm(z_mag~i_mag, x=T, weights=1/(sig_z_mag*sig_z_mag)) summary(fit_wt) dev.set(2) ; abline(fit_wt$coef,lty=2,lwd=2) # Generalized linear modeling fit_glm_gau <- glm(z_mag ~ i_mag) ; summary(fit_glm_gau) fit_glm_gam <- glm(z_mag ~ i_mag,family=Gamma) ; summary(fit_glm_gam) # Robust M-estimator library(MASS) fit_M <- rlm(z_mag~i_mag, method='M', weights=1/(sig_z_mag*sig_z_mag), wt.method='inv.var') summary(fit_M) length(which(fit_M$w<1.0)) dev.set(3) ; hist(fit_M$w, breaks=50, ylim=c(0,700), xlab='Robust regression weights', main='') aM <- fit_M$coef[[1]] ; bM <- fit_M$coef[[2]] dev.set(2) ; lines(c(18,22), c(aM+bM*18, aM+bM*22), lty=3, lwd=3) # Thiel-Sen regression line install.packages('zyp') ; library(zyp) fit_ts <- zyp.sen(z_mag~i_mag, qso1[1:8000,]) confint.zyp(fit_ts) dev.set(2) ; abline(fit_ts$coef, lty=4, lwd=2) # Linear quantile regression install.packages('quantreg') ; library(quantreg) fit_rq <- rq(z_mag~i_mag, tau=c(0.10,0.50,0.90)) print.summary.rq(fit_rq) par(mfrow=c(1,2)) plot(i_mag, z_mag, pch=20, cex=0.1, col=grey(0.5), xlim=c(18,22), ylim=c(17.5,22), xlab="SDSS i (mag)", ylab="SDSS z (mag)") for(j in 0:2) { aquant=fit_rq$coef[[2*j+1]] ; bquant=fit_rq$coef[[2*j+2]] lines(c(18,22),c((aquant+bquant*18),(aquant+bquant*22)),lwd=2) } # Nonlinear quantile regression fit_rqss.1 <- rqss(z_mag~qss(i_mag), data=qso1, tau=0.10) fit_rqss.5 <- rqss(z_mag~qss(i_mag), data=qso1, tau=0.50) fit_rqss.9 <- rqss(z_mag~qss(i_mag), data=qso1, tau=0.90) plot.rqss(fit_rqss.1, rug=F, ylim=c(17.5,22), titles='') points(i_mag, z_mag, cex=0.1, pch=20, col=grey(0.5)) plot.rqss(fit_rqss.1, shade=F, rug=F, add=T, titles='', lwd=2) plot.rqss(fit_rqss.5, shade=F, rug=F, add=T, titles='', lwd=2) plot.rqss(fit_rqss.9, shade=F, rug=F, add=T, titles='', lwd=2) par(mfrow=c(1,1)) # Fit Sersic function to NGC 4472 elliptical galaxy surface brightness profile NGC4472 <- read.table("http://astrostatistics.psu.edu/MSMA/datasets/NGC4472_profile.dat", header=T) attach(NGC4472) NGC4472.fit <- nls(surf_mag ~ -2.5*log10(I.e * 10^(-(0.868*n-0.142)* ((radius/r.e)^{1/n}-1))) + 26, data=NGC4472, start=list(I.e=20., r.e=120.,n=4.), model=T, trace=T) summary(NGC4472.fit) deviance(NGC4472.fit) logLik(NGC4472.fit) # Plot NGC 4472 data and best-fit model plot(NGC4472.fit$model$radius, NGC4472.fit$model$surf_mag, pch=20, xlab="r (arcsec)", ylab=expression(mu ~~ (mag/sq.arcsec)), ylim=c(16,28), cex.lab=1.5, cex.axis=1.5) lines(NGC4472.fit$model$radius, fitted(NGC4472.fit)) # Fit and plot radial profiles of NGC 4406 and NGC 4451 NGC4406 <- read.table("http://astrostatistics.psu.edu/MSMA/datasets/NGC4406_profile.dat", header=T) attach(NGC4406) NGC4406.fit <- nls(surf_mag ~ -2.5*log10(I.e * 10^(-(0.868*n-0.142)* ((radius/r.e)^{1/n}-1))) + 32, data=NGC4406, start=list(I.e=20., r.e=120.,n=4.), model=T, trace=T) summary(NGC4406.fit) points(NGC4406.fit$model$radius, NGC4406.fit$model$surf_mag, pch=3) lines(NGC4406.fit$model$radius, fitted(NGC4406_fit)) NGC4551 <- read.table("http://astrostatistics.psu.edu/MSMA/datasets/NGC4551_profile.dat", header=T) attach(NGC4551) NGC4551.fit <- nls(surf_mag ~ -2.5*log10(I.e * 10^(-(0.868*n-0.142)* ((radius/r.e)^{1/n}-1))) + 26, data=NGC4551, start=list(I.e=20.,r.e=15.,n=4.), model=T, trace=T) summary(NGC4551.fit) points(NGC4551.fit$model$radius, NGC4551.fit$model$surf_mag, pch=5) lines(NGC4551.fit$model$radius, fitted(NGC4551_fit)) legend(500, 20, c("NGC 4472","NGC 4406", "NGC 4551"), pch=c(20,3,5)) # NGC 4472 analysis # Residual plot plot(NGC4472.fit$model$radius,residuals(NGC4472.fit), xlab="r (arcsec)", ylab="Residuals", pch=20, cex.lab=1.5, cex.axis=1.5) lines(supsmu(NGC4472.fit$model$radius, residuals(NGC4472.fit), span=0.05), lwd=2) # Test for normality of residuals qqnorm(residuals(NGC4472.fit) / summary(NGC4472.fit)$sigma) abline(a=0,b=1) shapiro.test(residuals(NGC4472.fit) / summary(NGC4472.fit)$sigma) # Bootstrap parameter estimates install.packages('nlstools') ; library(nlstools) NGC4472.boot <- nlsBoot(NGC4472.fit) summary(NGC4472.boot) curve(dnorm(x,m=5.95, sd=0.10)*58/5.95, xlim=c(5.6,6.4), ylim=c(0,50)) hist(NGC4472.boot$coefboot[,3], breaks=50, add=T) # not shown ****************************************************************** ****************************************************************** Chapter 8. Multivariate Analysis # Overview of globular cluster properties GC = read.table("http://astrostatistics.psu.edu/MSMA/datasets/GlobClus_prop.dat", header=T, fill=T) summary(GC) manyhist <- function(x) { par(mfrow=n2mfrow(dim(x)[2])) for (i in 1:dim(x)[2]) { name=names(x)[i] hist(x[,i], main='', breaks='FD', ylab='', xlab=name) }} par(mfrow=c(1,1)) manyhist(GC[,2:13]) # Univariate boxplots and normal quantile-quantile plots for two variables par(mfrow=c(1,4)) boxplot(GC[,8], main='', ylab=expression(Core~r~~ (pc)), pars=list(xlab='', cex.lab=1.3, cex.axis=1.3, pch=20)) qqnorm(GC[,8], main='', xlab='', ylab='', cex.lab=1.3, cex.axis=1.3, pch=20) boxplot(GC[,20], main='', ylab=expression(CSB ~~(mag/arcmin^2)), cex.lab=1.3, cex.axis=1.3, pars=list(xlab='', pch=20)) qqnorm(GC[,20], main='', xlab='', ylab='', cex.lab=1.3, cex.axis=1.3, pch=20) qqline(GC[,20]) par(mfrow=c(1,1)) # Prepare the data # 1. Remove objects with NA entries, remove labels dim(GC) ; GC1 <- na.omit(GC[,-1]) ; dim(GC1) # 2. Standardize variables GC2 <- scale(GC1) # 3. Separate locational and dynamical variables GCloc <- GC2[,+c(1:4)] GCdyn <- GC2[,-c(1:4)] # 4. Remove two bad outliers GCdyn1 <- GCdyn[-c(which.max(GCdyn[,4]), which.max(GCdyn[,11])),] GCloc1 <- GCloc[-c(which.max(GCdyn[,4]), which.max(GCdyn[,11])),] # Bivariate relationships cor(GCdyn1, method='kendall') var(GCdyn1) pairs(GCdyn1[,3:8],pch=20,cex=0.4) # Elaborated color pairs plot library(MASS) pairs(GCdyn1[,5:8],main='',labels=names(GCdyn1[5:8]), panel=function(x,y) { + abline(lsfit(x,y)$coef,lwd=2,col='deeppink2') + lines(lowess(x,y),lwd=3,col='blue3',lty=2) + points(x,y,pch=21,bg = c("red", "green3", "blue")) + rug(jitter(x,factor=3),side=1,col='lightcoral',ticksize=-.05) + rug(jitter(y,factor=3),side=2,col='cornflowerblue',ticksize=-.05) + contour(kde2d(x,y)$x, kde2d(x,y)$y, kde2d(x,y)$z, drawlabels=F,add=T, col='darkblue',nlevel=4) + }) # PCA for dynamical variables. PCdyn <- princomp(GCdyn1) plot(PCdyn, main='') summary(PCdyn) loadings(PCdyn) biplot(PCdyn, col='black', cex=c(0.6,1)) # Add principal component values into the data frame PCdat <- data.frame(names=row.names(GCdyn1), GCdyn1, PCdyn$scores[,1:4]) # Multiple regression to predict globular cluster central surface brightnesses attach(GCdyn) CSB_fit1 <- lm(Cent.surf.bright~.-Cent.surf.bright,data=GCdyn) ; CSB_fit1 CSB_fitt <- lm(Cent.surf.bright~.,data=GCdyn[,c(7:11,13)]) ; CSB_fit2 str(CSB_fit2) summary(CSB_fit2) sd(CSB_fit2$residuals) par(mfrow=c(2,1)) plot(CSB_fit2$fitted.values, Cent.surf.bright, pch=20) qqnorm(CSB_fit2$residuals, pch=20, main='') qqline(CSB_fit2$residuals) par(mfrow=c(1,1)) # MARS nonlinear regression install.packages('earth') ; library(earth) CSB_fit3 <- earth(Cent.surf.bright~.-Cent.surf.bright, data=GCdyn) ; CSB_fit3 sd(CSB_fit3$residuals) qqnorm(CSB_fit2$residuals) ; qqline(CSB_fit2$residuals) # Some multivariate display techniques: # interactive 3-dim scatter plot; 4-dim bubble plot; parallel coordinates plot library(rgl) open3d() ; plot3d(GCdyn1[,5:7]) snapshot3d(file='GlobClus3D.png') library(lattice) cloud(GCdyn1[,5]~GCdyn1[,6]*GCdyn[,7], screen=list(z=60,x=45,y=20), xlab='log.t.rad', ylab='log.rho.cen', zlab='conc', col=1, pch=1, cex=GCdyn1[,8]+1) parallel(~GCdyn1, col=c('darkred','darkgreen','orange','blue','black')) ****************************************************************** ****************************************************************** Chapter 9. Clustering, Classification and Data Mining # Color-magnitude diagram for low-redshift COMBO-17 galaxies COMBO_loz=read.table('http://astrostatistics.psu.edu/MSMA/datasets/COMBO17_lowz.dat', header=T, fill=T) dim(COMBO) ; names(COMBO) par(mfrow=c(1,2)) plot(COMBO_loz, pch=20, cex=0.5, xlim=c(-22,-7), ylim=c(-2,2.5), xlab=expression(M[B]~~(mag)), ylab=expression(M[280] - M[B]~~(mag)), main='') # Two-dimensional kernel-density estimator library(MASS) COMBO_loz_sm <- kde2d(COMBO_loz[,1], COMBO_loz[,2], h=c(1.6,0.4), lims = c(-22,-7,-2,2.5), n=500) image(COMBO_loz_sm, col=grey(13:0/15), xlab=expression(M[B]~~(mag)), ylab=expression(M[280] - M[B]~~(mag)), xlim=c(-22,-7), ylim=c(-2,2.5), xaxp=c(-20,-10,2)) par(mfrow=c(1,1)) # Standardize variables Mag_std <- scale(COMBO_loz[,1]) Color_std <- scale(COMBO_loz[,2]) COMBO_std <- cbind(Mag_std,Color_std) # Hierarchical clustering COMBO_dist <- dist(COMBO_std) COMBO_hc <- hclust(COMBO_dist, method='complete') COMBO_coph <- cophenetic(COMBO_hc) cor(COMBO_dist, COMBO_coph) # Cutting the tree at k=5 clusters plclust(COMBO_hc, label=F) COMBO_hc5a <- rect.hclust(COMBO_hc, k=5, border='black') str(COMBO_hc5a) COMBO_hc5b <- cutree(COMBO_hc, k=5) str(COMBO_hc5b) plot(COMBO_loz, pch=(COMBO_hc5b+19), cex=0.7, xlab=expression(M[B]~~(mag)), ylab=expression(M[280] - M[B]~~(mag)), main='') # Density-based clustering algorithm install.packages('fpc') ; library(fpc) COMBO_dbs <- dbscan(COMBO_std, eps=0.1, MinPts=5, method='raw') print.dbscan(COMBO_dbs) ; COMBO_dbs$cluster plot(COMBO_loz[COMBO_dbs$cluster==0,], pch=20, cex=0.7, xlab='M_B (mag)', ylab='M_280 - M_B (mag)') points(COMBO_loz[COMBO_dbs$cluster==2,], pch=2, cex=1.0) points(COMBO_loz[COMBO_dbs$cluster==1 | COMBO_dbs$cluster==3,], pch=1, cex=1.0) # r-band distribution of Sloan quasars SDSS_qso <- read.table("http://astrostatistics.psu.edu/MSMA/datasets/SDSS_17K.dat", header=T) dim(SDSS_qso) ; summary(SDSS_qso) qso_r <- SDSS_qso[,5] ; n_qso <- length(qso_r) par(mfrow=c(1,2)) hist(qso_r, breaks=100, main='', xlim=c(16,26)) # Normal mixture model install.packages('mclust') ; library(mclust) fit.mm <- Mclust(qso_r,modelNames="V") plot(fit.mm, ylim=c(-67000,-65000)) str(fit.mm) fit.mm$parameters$mean sqrt(fit.mm$parameters$variance$sigmasq) fit.mm$parameters$pro par(mfrow=c(1,1)) plot(ecdf(qso_r), xlim=c(16,26), xlab=' r (mag)', main='') r.seq <- seq(10,30,0.02) sum.comp <- rep(0,length(r.seq)) for (i in 1:fit.mm$G) { sum.comp <- sum.comp + pnorm(r.seq, mean= fit.mm$parameters$mean[[i]], sd=sqrt(fit.mm$parameters$variance$sigmasq[[i]])) * fit.mm$parameters$pro[[i]] } lines(r.seq, sum.comp, lwd=3, lty=2) for (i in 1:fit.mm$G) {lines(r.seq, dnorm(r.seq, mean=fit2$parameters$mean[[i]], sd=sqrt(fit2$parameters$variance$sigmasq[[i]])) * fit2$parameters$pro[[i]]) } # Model-based clustering library(mclust) COMBO_mclus <- mclustBIC(COMBO_loz,modelNames='VVV') plot(COMBO_mclus, col='black') COMBO_sum_mclus <- summary.mclustBIC(COMBO_mclus,COMBO_loz,3) COMBO_sum_mclus$parameters ; COMBO_sum_mclus$classification COMBO_sum_mclus$z ; COMBO_sum_mclus$uncertainty plot(COMBO_loz, pch=(19+COMBO_sum_mclus$classification), cex=1.0, xlab='M_B (mag)', ylab='M_280 - M_B (mag)', main='COMBO-17 MVN model clustering (k=3)', cex.lab=1.3, cex.axis=1.3) # R script for constructing SDSS test and training datasets is given # in Appendix C. # Unsupervised k-means partitioning SDSS.kmean <- kmeans(SDSS_test,6) print(SDSS.kmean$centers) plot(SDSS_test[,1], SDSS_test[,2], pch=20, cex=0.3, col=gray(SDSS.kmean$cluster/7), xlab='u-g (mag)', ylab='g-r (mag)', xlim=c(-0.5,3), ylim=c(-0.6,1.5)) # Linear discriminant analysis library(MASS) SDSS_lda <- lda(SDSS_train[,1:4], as.factor(SDSS_train[,5])) SDSS_train_lda <- predict(SDSS_lda, SDSS_train[,1:4]) SDSS_test_lda <- predict(SDSS_lda, SDSS_test[,1:4]) par(mfrow=c(2,1)) plot(SDSS_test[,1],SDSS_test[,2], xlim=c(-0.7,3), ylim=c(-0.7,1.8), pch=20, col=SDSS_test_lda$class, cex=0.5, main='', xlab='u-g (mag)', ylab='g-r (mag)') # k-nn classification install.packages('class') ; library(class) SDSS_knn4 <- knn(SDSS_train[,1:4], SDSS_test, as.factor(SDSS_train[,5]), k=4, prob=T) plot(SDSS_test[,1], SDSS_test[,2], xlim=c(-0.7,3), ylim=c(-0.7,1.8), pch=20, col=SDSS_knn4, cex=0.5, main='', xlab='u-g (mag)', ylab='g-r (mag)') # Validation of k-nn classification SDSS_train_lda <- lda(SDSS_train[,1:4], as.factor(SDSS_train[,5])) SDSS_train_knn4 <- knn(SDSS_train[,1:4], SDSS_train[,1:4], SDSS_train[,5],k=4) plot(jitter(as.numeric(SDSS_train_lda$class), factor=0.5), jitter(as.numeric (SDSS_train[,5]), factor=0.5), pch=20, cex=0.5, xlab='LDA class', ylab='True class', xaxp=c(1,3,2),yaxp=c(1,3,2)) plot(jitter(as.numeric(SDSS_train_knn4), factor=0.5), jitter(as.numeric (SDSS_train[,5]), factor=0.5), pch=20, cex=0.5, xlab='k-nn class', ylab='True class', xaxp=c(1,3,2),yaxp=c(1,3,2)) par(mfrow=c(1,1)) # Single layer neutral network options(size=100, maxit=1000) SDSS_nnet <- multinom(as.factor(SDSS_train[,5]) ~ SDSS_train[,1] + SDSS_train[,2] + SDSS_train[,3] + SDSS_train[,4], data=SDSS_train) SDSS_train_nnet <- predict(SDSS_nnet,SDSS_train[,1:4]) plot(jitter(as.numeric(SDSS_train_nnet), factor=0.5), jitter(as.numeric (SDSS_train[,5]), factor=0.5), pch=20, cex=0.5, xlab='nnet class', ylab='True class', xaxp=c(1,3,2),yaxp=c(1,3,2)) # Classification And Regression Tree model, prediction and validation library('rpart') SDSS_rpart_mod <- rpart(SDSS_train[,5] ~., data=SDSS_train[,1:4]) SDSS_rpart_test_pred <- predict(SDSS_rpart_mod, SDSS_test) SDSS_rpart_train_pred <- predict(SDSS_rpart_mod, SDSS_train) summary(SDSS_rpart_mod) ; str(SDSS_rpart_mod) plot(SDSS_test[,1], SDSS_test[,2], xlim=c(-0.7,3), ylim=c(-0.7,1.8), pch=20, col=round(SDSS_rpart_test_pred), cex=0.5, main='', xlab='u-g (mag)', ylab='g-r (mag)') plot(jitter(SDSS_rpart_train_pred, factor=5), jitter(SDSS_train[,5]), pch=20, cex=0.5, xlab='CART class', ylab='True class',yaxp=c(1,3,2)) plot(SDSS_rpart_mod, branch=0.5, margin=0.05) text(SDSS_rpart_mod, digits=3, use.n=T, cex=0.8) plotcp(SDSS_rpart_mod, lwd=2, cex.axis=1.3, cex.lab=1.3) # Support Vector Machine model, prediction and validation install.packages('e1071') ; library(e1071) SDSS_svm_mod <- svm(SDSS_train[,5] ~.,data=SDSS_train[,1:4],cost=100, gamma=1) summary(SDSS_svm_mod) ; str(SDSS_svm_mod) SDSS_svm_test_pred <- predict(SDSS_svm_mod, SDSS_test) SDSS_svm_train_pred <- predict(SDSS_svm_mod, SDSS_train) plot(SDSS_test[,1], SDSS_test[,2], xlim=c(-0.7,3), ylim=c(-0.7,1.8), pch=20, col=round(SDSS_svm_test_pred), cex=0.5, main='', xlab='u-g (mag)', ylab='g-r (mag)') plot(SDSS_svm_train_pred, jitter(SDSS_train[,5]), pch=20, cex=0.5, xlab='SVM class', ylab='True class', yaxp=c(1,3,2)) # Final SVM classification of the test set par(mfrow=c(1,3)) plot(SDSS_test[,1], SDSS_test[,2], xlim=c(-0.7,3), col=round(SDSS_svm_test_pred), ylim=c(-0.7,1.8), pch=20, cex=0.5, main='', xlab='u-g (mag)',ylab='g-r (mag)') plot(SDSS_test[,2], SDSS_test[,3], xlim=c(-0.7,1.8), col=round(SDSS_svm_test_pred), ylim=c(-0.7,1.8), pch=20, cex=0.5, main='', xlab='g-r (mag)',ylab='r-i (mag)') plot(SDSS_test[,3], SDSS_test[,4], xlim=c(-0.7,1.8), col=round(SDSS_svm_test_pred), ylim=c(-1.1,1.3), pch=20, cex=0.5, main='', xlab='r-i (mag)',ylab='i-z (mag)') par(mfrow=c(1,1)) SDSS_test_svm_out <- cbind(SDSS[,6], SDSS[,7], SDSS_test, SDSS_svm_test_pred) names(SDSS_test_svm_out)[c(1,2,7)] <- c('R.A.', 'Dec', 'SVM Class') write.table(format(SDSS_test_svm_out), file='SDSS_test_svm.out',sep='\\t',quote=F) ****************************************************************** ****************************************************************** Chapter 10. Nondetections: Censored and Truncated Data # Construct sample of X-ray luminosities of low-redshift (z<0.3) Sloan quasars qso <- read.table("http://astrostatistics.psu.edu/MSMA/datasets/SDSS_QSO.dat", head=T, fill=T) dim(qso) ; names(qso) ; summary(qso) qso.lowz <- qso[qso[,2]<0.3,] dim(qso.lowz) ; names(qso.lowz) ; summary(qso.lowz) attach(qso.lowz) # X-ray luminosities for detections and (approximately) for upper limits ROSAT[ROSAT < (-8.900)] <- NaN # X-ray nondetections Dist <- (3*10^{10})*z*(3.09*10^{24}) / (73*10^5) XLum <- 8.3*10^(-12)*10^(ROSAT) * (4*pi*Dist^2) XLum[is.na(ROSAT)] <- 1.*10^(-12)*runif(1)* (4*pi*Dist[is.na(ROSAT)]^2) par(mfrow=c(1,2)) plot(log10(XLum[!is.na(ROSAT)]), ylim=c(42.5,45), xlab='', ylab='log(Lx) erg/s', pch=20) plot(log10(XLum[is.na(ROSAT)]), ylim=c(42.5,45), xlab='', ylab='log(Lx) erg/s', pch=25) par(mfrow=c(1,1)) # Construct quasar Kaplan-Meier luminosity function and 95% confidence band library(survival) Xstatus <- seq(1,1,length.out=400) # 1 = detected Xstatus[is.na(ROSAT)] <- 0 # 0 = right-censored survobj <- Surv(-XLum, Xstatus) KM.XLF<- survfit(survobj~1, conf.int=0.95, conf.type='plain', conf.lower='modified') KM.output <- summary(survfit(survobj~1)) # Plot Kaplan-Meier estimator with confidence bands plot(log10(-KM.XLF$time), KM.XLF$surv, ylim=c(0,1), pch=20, cex=0.5, main='', xlab=expression(log*L[x]), ylab='Kaplan-Meier estimator') lines(log10(-KM.XLF$time), KM.XLF$upper, lty=2) lines(log10(-KM.XLF$time), KM.XLF$lower, lty=2) # Prepare data for survival two-sample tests # Construct subsamples for XLFs of radio detected vs. undetected quasars # Class = 1 are radio detections, Class = 2 are radio nondetections # X-ray luminosities are obtained from ROSAT counts radio.obs <- qso.lowz[qso.lowz[,15]>(-1),] attach(radio.obs) radio.class <- seq(1,1,length.out=370) for(i in 1:370) if(radio.obs[i,15]>0) radio.class[i]=2 ROSAT[ROSAT < (-8.900)] <- NaN Xstatus <- seq(1,1,length.out=370) Xstatus[is.na(ROSAT[1:370])] <- 0 Dist <- (3*10^{10})*z*(3.09*10^{24}) / (73*10^5) XLum <- seq(1,1,length.out=370) XLum[1:370] <- 8.3*10^(-12)*10^(ROSAT) * (4*pi*Dist^2) XLum[is.na(ROSAT[1:370])] <- 1.*10^(-12)*runif(1)* (4*pi*Dist[is.na(ROSAT)]^2) # Apply five two-sample tests for radio detected vs. nondetected subsamples survdiff(Surv(-XLum,Xstatus) ~ radio.class, rho=0) survdiff(Surv(-XLum,Xstatus) ~ radio.class, rho=1) install.packages('surv2sample') ; library(surv2sample) surv2.ks.out <- surv2.ks(Surv(-XLum,Xstatus), radio.class,approx='boot') surv2.ks.out # Read dataset on beryllium and lithium abundances in stars abun <- read.table('http://astrostatistics.psu.edu/MSMA/datasets/censor_Be.dat', header=T) dim(abun) ; names(abun) ; attach(abun) # Boxplot of abundances for stars with and without planets install.packages('NADA') ; library(NADA) cen_Be <- seq(FALSE, FALSE, length=68) ; cen_Be[Ind_Be==0] <- TRUE cen_Li <- seq(FALSE, FALSE, length=68) ; cen_Li[Ind_Li==0] <- TRUE Type_factor <- as.factor(Type) par(mfrow=c(1,2)) cenboxplot(logN_Be, cen_Be, Type_factor, log=FALSE, ylab='log N(Be)', names=c('Planets','No planets'), boxwex=0.5, notch=TRUE, varwidth=TRUE, cex.axis=1.5, cex.lab=1.5, lwd=2) cenboxplot(logN_Li, cen_Li, Type_factor, log=FALSE, ylab='log N(Li)', names=c('Planets','No planets'), boxwex=0.5, notch=TRUE, varwidth=TRUE, cex.axis=1.5, cex.lab=1.5, lwd=2) par(mfrow=c(1,1)) # Test significance of possible lithium abundance effect logN_Li1 <- logN_Li[-c(1,23)] ; cen_Li1 <- cen_Li[-c(1,23)] Type_factor1 <- Type_factor[-c(1,23)] # remove NaN values cendiff(logN_Li1, cen_Li1, Type_factor1, rho=0) cendiff(logN_Li1, cen_Li1, Type_factor1,rho=1) # Reproduce Santos et al. (2002) plot of stellar beryllium vs. lithium abundance ind_det1 <- which(Ind_Li==1 & Ind_Be==1 & Type==1) # filled circles ind_det2 <- which(Ind_Li==1 & Ind_Be==1 & Type==2) # open circles ind_left1 <- which(Ind_Li==0 & Ind_Be==1 & Type==1) ind_left2 <- which(Ind_Li==0 & Ind_Be==1 & Type==2) ind_down1 <- which(Ind_Li==1 & Ind_Be==0 & Type==1) ind_down2 <- which(Ind_Li==1 & Ind_Be==0 & Type==2) ind_both1 <- which(Ind_Li==0 & Ind_Be==0 & Type==1) ind_both2 <- which(Ind_Li==0 & Ind_Be==0 & Type==2) plot(logN_Li[ind_det1], logN_Be[ind_det1], xlim=c(-0.6,3.5), ylim=c(-0.2,1.5), main="", xlab="log N(Li)", ylab="log N(Be)", pch=16, lwd=2) # plot detections points(logN_Li[ind_det2], logN_Be[ind_det2], pch=1) arrows(logN_Li[ind_left1], logN_Be[ind_left1], logN_Li[ind_left1]-0.2, logN_Be[ind_left1],length=0.1) # plot left arrows arrows(logN_Li[ind_left2], logN_Be[ind_left2], logN_Li[ind_left2]-0.2, logN_Be[ind_left2],length=0.1) points(logN_Li[ind_left1], logN_Be[ind_left1], pch=16) points(logN_Li[ind_left2], logN_Be[ind_left2], pch=1) arrows(logN_Li[ind_down1], logN_Be[ind_down1], logN_Li[ind_down1], logN_Be[ind_down1]-0.1, length=0.1) arrows(logN_Li[ind_down2], logN_Be[ind_down2], logN_Li[ind_down2], logN_Be[ind_down2]-0.1, length=0.1) points(logN_Li[ind_down1], logN_Be[ind_down1], pch=16) points(logN_Li[ind_down2], logN_Be[ind_down2], pch=1) arrows(logN_Li[ind_both1], logN_Be[ind_both1], logN_Li[ind_both1]-0.2, logN_Be[ind_both1], length=0.1) # plot double arrows arrows(logN_Li[ind_both1], logN_Be[ind_both1], logN_Li[ind_both1], logN_Be[ind_both1]-0.1,length=0.1) arrows(logN_Li[ind_both2], logN_Be[ind_both2], logN_Li[ind_both2]-0.2, logN_Be[ind_both2],length=0.1) arrows(logN_Li[ind_both2], logN_Be[ind_both2], logN_Li[ind_both2], logN_Be[ind_both2]-0.1,length=0.1) points(logN_Li[ind_both1], logN_Be[ind_both1], pch=16) points(logN_Li[ind_both2], logN_Be[ind_both2], pch=1) # Bivariate correlation and regression using Akritas-Thiel-Sen procedure logN_Li1 <- logN_Li[-c(1,23)] # remove two points with NaN entries Ind_Li1 <- Ind_Li[-c(1,23)] ; logN_Be1 <- logN_Be[-c(1,23)] Ind_Be1 <- Ind_Be[-c(1,23)] Li_cen <- seq(FALSE, FALSE, length=66) # construct censoring indicator variables Li_cen[which(Ind_Be1==0)] <- TRUE Be_cen=seq(FALSE, FALSE, length=66) Be_cen[which(Ind_Li1==0)] <- TRUE cenken_out <- cenken(logN_Be1, Be_cen, logN_Li1, Li_cen) abline(a=cenken_out$intercept, b=cenken_out$slope, lwd=2) # Construct a sample of bright nearby Hipparcos stars hip <- read.table('http://astrostatistics.psu.edu/MSMA/datasets/HIP1.tsv', header=T, fill=T) attach(hip) ; dim(hip); summary(hip) # Plot luminosity distribution of stars and their truncation limits AbsMag <- Vmag + 5*log10(Plx/1000) + 5 Lum <- 2.512^(4.84 - AbsMag) plot(density(log10(Lum)),ylim=c(0,1.7), main='', xlab='log L (solar, V band)', lwd=2, cex.lab=1.2) AbsMaglim <- 10.5 + 5*log10(Plx/1000) + 5 Lumlim <- 2.512^(4.84 - AbsMaglim) lines(density(log10(Lumlim)), lty=2, lwd=2) text(0.7, 0.5, 'Hipparcos sample', pos=4, font=2) text(-0.5, 1.2, 'Truncation limits', pos=4, font=2) # Compute Lynden-Bell-Woodroofe estimator with 90% confidence bands install.packages('DTDA') ; library(DTDA) LBW.hip <- efron.petrosian(log10(Lum), log10(Lumlim), boot=T, B=100, alpha=0.1) summary(LBW.hip) plot(LBW.hip$time, LBW.hip$survival, pch=20, cex=0.6, main='', xlab='log L (solar, V band)', ylab='Density', cex.lab=1.2) upper <- LBW.hip$upper.Sob[-(1000:1013)] lower <- LBW.hip$lower.Sob[-(1000:1013)] lines(LBW.hip$time,upper, lty=2) ; lines(LBW.hip$time, lower, lty=2) # Compare with observed Wielen 1983 local star LF Wielen.MV <- seq(0, 12, 1) Wielen.LF <- c(35,126,209,380,676,955,1050,891,1120,1410,2140,2510,4470) points(log10(2.512^(4.84-Wielen.MV)), Wielen.LF/2500) # Compare with binned 1/V.max luminosity function (Schmidt 1968) Vol.max <- (4*pi/3)*(1000/Plx)^3 bin.sum <- function(vol) sum(1/vol) Lum.bins <- cut(log10(Lum), breaks=seq(-2.5,3.0,0.5), ord=T) Schmidt.LF <- by(Lum, Lum.bins, bin.sum) lines(seq(-2.25, 2.75, 0.5), Schmidt.LF/4500, pch=3, lwd=2) ****************************************************************** ****************************************************************** Chapter 11. Time Series Analysis # Read in GX 5-1 data and create time series GX.dat <- scan("http://astrostatistics.psu.edu/MSMA/datasets/GX.dat") GX.time <- seq(from=0, to=512, length.out=65536) GX.ts <- ts(GX.dat, GX.time) ; GX.ts.offset <- ts(GX.dat-30, GX.time) # Compare histogram of counts to normal distribution hist(GX.dat, breaks=100, xlim=c(40,100), ylim=c(0,3500), xlab='GX 5-1 counts', font=2, font.lab=2, main='') curve(dnorm(x,mean=mean(GX.dat), sd=sqrt(mean(GX.dat)))*65536, lwd=3, add=T) sd(GX.dat) / sqrt(mean(GX.dat)) # result is 1.24 # Examine raw and smoothed time series plot.ts(GX.ts[1:6000], ylab='GX 5-1 counts', xlab='Time (1/128 sec)', cex.lab=1.3, cex.axis=1.3) plot(GX.time,GX.dat, ylim=c(-10,115), xlab='Time (sec)', ylab="GX 5-1 counts", cex.lab=1.3, cex.axis=1.3, type='n') lines(ksmooth(GX.time, GX.dat+30, 'normal', bandwidth=7), lwd=2) text(450, 110, 'Normal kernel') lines(filter(GX.ts, sides=2, rep(1,7)/7), lwd=2) text(450, 85, 'Moving average') lines(kernapply(GX.ts.offset, kernel('modified.daniell', 7)), lwd=2) text(450, 50, 'Modified daniell') lines(supsmu(GX.time, GX.dat-60), lwd=2) text(400, 20, "Friedman's supersmoother") lines(lowess(GX.time, GX.dat-80, 0.05), lwd=2) text(450, 0, "Local regression") # Raw periodogram f <- 0:32768/65536 I <- (4/65536) * abs(fft(GX.ts) / sqrt(65536))^2 plot(f[2:60000], I[2:60000], type="l", ylab="Power", xlab="Frequency") Pergram <- spec.pgram(GX.ts,log='no',main='') summary(Pergram) Pergram$spec[500] # value of 500th point 2*Pergram$spec[500] / qchisq(c(0.025,0.975),2) # Raw and smoothed periodogram par(mfrow=c(3,1)) spec.pgram(GX.ts, log='no', main='', sub='') spec.pgram(GX.ts, spans=50, log='no', main='', sub='') spec.pgram(GX.ts, spans=500, log='no', main='', sub='') # Autocorrelation functions of the GX 5-1 time series par(mfrow=c(1,2)) acf(GX.ts, 40, main='', ci.col='black', ylim=c(-0.05,0.3), lwd=2) pacf(GX.ts, 40, main='', ci.col='black', ylim=c(-0.05,0.3), lwd=2) # Autoregressive modeling ARmod <- ar(GX.ts, method='ols') ARmod$order # model selection based on AIC ARmod$ar # best-fit parameter values ARmod$asy.se.coef$ar # parameter standard errors plot(0:29, log10(ARmod$aic[1:30]), xlab='AR model parameters', ylab='log(AIC)', pch=20) arrows(27, 0.4, 27, 0.0, length=0.1) # Spectrum of AR model ARspec <- spec.ar(GX.ts, plot=F) GXspec <- spec.pgram(GX.ts, span=101, main='', sub='', lwd=2) lines(ARspec$freq, ARspec$spec, col='red', lwd=4) legend(0.23, 550, c('Periodogram, Daniell smooth', 'AR(27) model'), lty=c(1,1), lwd=c(2,4), col=c('black','red')) # Spectral estimates of the long-range memory parameter d install.packages('fracdiff') ; library(fracdiff) d.FARIMA <- fracdiff(GX.ts, nar=27, nma=1, ar=ARmod$ar) ; d.FARIMA$d d.GPH <- fdGPH(GX.ts) d.Reisen <- fdSperio(GX.ts) # Time domain estimates of the long-range memory parameter H=d+1/2 install.packages('fractal') ; library(fractal) d.DFA <- DFA(GX.ts) ; d.DFA plot(d.DFA) d.ACVF <- hurstACVF(GX.ts) ; d.ACVF d.block <- hurstBlock(GX.ts) ; d.block # Discrete wavelet transform install.packages('waveslim') ; library(waveslim) GX.wav <- dwt(GX.ts,n.levels=10) par(mfrow=c(3,1)) plot.ts(up.sample(GX.wav[[4]],2^4),type='h',axes=F,ylab='') ; abline(h=0) plot.ts(up.sample(GX.wav[[7]],2^7),type='h',axes=F,ylab='',lwd=2) ; abline(h=0) plot.ts(up.sample(GX.wav[[10]],2^{10}),type='h',axes=F,ylab='',lwd=2) ; abline(h=0) par(mfrow=c(1,1)) ****************************************************************** ****************************************************************** Chapter 12. Spatial Point Processes # Construct three galaxy redshift datasets, plot using spatstat shap <- read.table('http://astrostatistics.psu.edu/MSMA/datasets/Shapley_galaxy.dat', header=T, fill=T) attach(shap) ; dim(shap) ; summary(shap) shap.hi <- shap[(R.A. < 205) & (R.A. > 200) & (Dec. > -34) & (Dec. < -29) ,] shap.lo <- shap[(R.A. < 214) & (R.A. > 209) & (Dec. > -34) & (Dec. < -27) ,] shap.clus <- shap[(R.A. <204.5) & (R.A. > 200.4) & (Dec. > -32.5) & (Dec. < -31.0) & (Vel > 11000) & (Vel < 18000),] # Plot in 3-dimensions using rgl, plot and scatterplot3d install.packages('rgl') ; library(rgl) rgl.open() rgl.points(scale(shap[,1]), scale(shap[,2]), scale(shap[,4])) rgl.bbox() rgl.snapshot('Shapley.png') rgl.close() plot(shap.lo[,1], shap.lo[,2], cex=(scale(shap.lo[,4])+1.5)/2, xlab='Right Ascension (degrees)', ylab='Declination (degrees)') install.packages('scatterplot3d') ; library(scatterplot3d) scatterplot3d(shap.hi[,c(1,2,4)], pch=20, cex.symbols=0.7, type='p', angl=40, zlim=c(0,50000)) # Preparation of nearest neighbor lists for spdep analysis install.packages('spdep') ; library(spdep) shap.lo.mat <- as.matrix(shap.lo[,1:2]) nn.shap.lo <- knearneigh(shap.lo.mat, k=1) ; str(nn.shap.lo) nb.shap.lo <- knn2nb(nn.shap.lo) ; plot.nb(nb.shap.lo, shap.lo[,1:2]) nb.wt.shap.lo <- nb2listw(nb.shap.lo,style='B') ; summary(nb.wt.shap.lo) # Application of Moran's I and Geary's C statistics moran(shap.lo.mat[,1], nb.wt.shap.lo, n=length(nb.shap.lo), S0=Szero(nb.wt.shap.lo)) moran.test(shap.lo.mat[,1], nb.wt.shap.lo) moran.mc(shap.lo.mat[,1], nb.wt.shap.lo, nsim=10000) geary(shap.lo.mat[,1], nb.wt.shap.lo, n=length(nb.shap.lo), n1=length(nb.shap.lo)-1, S0=Szero(nb.wt.shap.lo)) geary.test(shap.lo.mat[,1], nb.wt.shap.lo) # Variogram analysis: gstat install.packages('gstat') ; library(gstat) shap.variog <- variogram(Vel~1, locations=~R.A.+Dec., data=shap) variog.mod1 <- vgm(7e+07, "Gau", 3.0,2e+07) variog.fit <- fit.variogram(shap.variog,variog.mod1) ; variog.fit plot(shap.variog,model <- variog.fit, col='black', pch=20, xlab='Distance (degree)', ylab="Semivariance (km/s*km/s)", lwd=2) # Variogram analysis: geoR install.packages('geoR') ; library(geoR) shap1 <- shap[-c(which(duplicated(shap[,1:2]))),] shap.geo <- as.geodata(shap1,coords.col=1:2, data.col=4) points.geodata(shap.geo,cex.min=0.2, cex.max=1.0, pt.div='quart', col='gray') plot.geodata(shap.geo, breaks=30) shap.vario <- variog(shap.geo, uvec=seq(0, 10, by=0.5)) plot(shap.vario,lwd=2, cex.lab=1.3, cex.axis=1.3, lty=1) shap.GRF1 <- likfit(shap.geo, ini.cov.pars=c(4e7,0.2)) summary.likGRF(shap.GRF1) lines.variomodel(shap.GRF1, lwd=2, lty=2) # Preparation for spatstat analysis install.packages('spatstat') ; library(spatstat) shap.lo.win <- owin(range(shap.lo[,1]), range(shap.lo[,2])) centroid.owin(shap.lo.win) ; area.owin(shap.lo.win) shap.lo.ppp <- as.ppp(shap.lo[,c(1,2,4)], shap.lo.win) # planar point pattern summary(shap.lo.ppp) plot(density(shap.lo.ppp,0.3), col=topo.colors(20), main='', xlab='R.A.', ylab='Dec.') plot(shap.lo.ppp, lwd=2, add=T) # K function for the Shapley low density region shap.lo.K <- Kest(shap.lo.ppp, correction='isotropic') shap.lo.K.bias <- Kest(shap.lo.ppp, correction='none') plot.fv(shap.lo.K, lwd=2, col='black', main='', xlab='r (degrees)', legend=F) plot.fv(shap.lo.K.bias, add=T, lty=3, lwd=2, col='black', legend=F) # Draw envelope of 100 simulations of CSR process shap.lo.K.env <- envelope(shap.lo.ppp, fun=Kest, nsim=100, global=T) xx <- c(0, shap.lo.K.env$r, rev(shap.lo.K.env$r), 0) yy <- c(c(0, shap.lo.K.env$lo), rev(c(0,shap.lo.K.env$hi))) polygon(xx, yy, col='gray') plot.fv(shap.lo.K, lwd=2, col='black', main='', add=T, legend=F) plot.fv(shap.lo.K.bias, add=T, lty=3, lwd=2, col='black', legend=F) # Similar plot for the L* function shap.lo.L <- Lest(shap.lo.ppp, correction='isotropic') shap.lo.L.bias <- Lest(shap.lo.ppp, correction='none') plot(shap.lo.L$r, (shap.lo.L$iso - shap.lo.L$r), lwd=2, col='black', main='',xlab='r (degrees)', ylab='L*(r)', ty='l', ylim=c(-0.2,0.2)) lines(shap.lo.L$r, (shap.lo.L$theo - shap.lo.L$r), lwd=2, lty=2) lines(shap.lo.L$r, (shap.lo.L.bias$un - shap.lo.L$r), lwd=2, lty=3) # Baddeley J function for the Shapley low density region plot(Jest(shap.lo.ppp), lwd=2, col='black', cex.lab=1.3, cex.axis=1.3, main='', xlab='r (degrees)', legend=F) # Two-point correlation function shap.lo.pcf <- pcf(shap.lo.ppp) plot(shap.lo.pcf, xlim=c(0.0,0.2)) plot(log10(shap.lo.pcf$r[2:512]), log10(shap.lo.pcf$trans[2:512]), type='l', lwd=2, xlab='log r (degrees)', ylab='log pair correlation fn') lines(c(-1,0), c(0.78+0.48,0.48), lwd=2, lty=2) lines(c(-2,0), c(0,0), lwd=2, lty=3) # Compute Dirichlet (Voronoi) tessellation shap.lo.dir <- dirichlet(shap.lo.ppp) summary(shap.lo.dir) ; plot(shap.lo.dir, main='') shap.lo.tile <- tiles(shap.lo.dir) ; str(shap.lo.tile) shap.lo.area <- list(lapply(shap.lo.tile, area.owin)) ; str(shap.lo.area) # Select small area tiles as clusters hist(as.numeric(shap.lo.area[[1]]), breaks=30) shap.lo.clus <- cut(as.numeric(shap.lo.area[[1]]), breaks=c(0,0.06,1)) plot(shap.lo.dir, main='') points(shap.lo.ppp, pch=20, cex=0.5) points(shap.lo.ppp[shap.lo.clus=='(0,0.06]'], pch=1,lwd=2) # IDW spatial interpolation using the gstat package xrange <- c(193,216) ; yrange <- c(-38,-26.5) shap.grid <- expand.grid(x=seq(from=xrange[1], to=xrange[2], by=0.05), y <- seq(from=yrange[1], to=yrange[2], by=0.05)) names(shap.grid) <- c("R.A.","Dec.") gridded(shap.grid) <- ~R.A.+Dec. plot(shap.grid) ; points(shap[,1:2], pch=20) shap.idw <- idw(shap[,4]~1, locations=~R.A.+Dec., shap[,1:2], shap.grid, idp=1.5) sp.theme(regions = list(col = topo.colors(100)), set=T) spplot(shap.idw) # Ordinary kriging using the geoR package shap.vario <- variog(shap.geo, uvec=seq(0, 10, by=0.2)) plot(shap.vario, lwd=2, lty=1) shap.variofit <- variofit(shap.vario, cov.model='gaussian') lines(shap.variofit, lty=2) shap.grid <- pred_grid(c(193,217), c(-38,-29), by=0.3) KC <- krige.control(obj.model=shap.variofit) shap.okrig <- krige.conv(shap.geo,loc=shap.grid, krig=KC) image(shap.okrig, xlim=c(195,203), ylim=c(-38,-27)) points(shap.geo, cex.min=0.3, cex.max=1.5, add=T) image(shap.okrig, loc=shap.grid, val=sqrt(shap.okrig$krige.var), xlim=c(195,203), ylim=c(-38,-27), zlim=c(3800,5000)) points(shap.geo,cex.min=0.3, cex.max=1.5, add=T) # Preparation of circular datasets from Hipparcos proper motion catalog hip <- read.table("http://astrostatistics.psu.edu/MSMA/datasets/HIP.dat", header=T, fill=T) dim(hip) ; summary(hip) hist(hip[,8], breaks=50) hip1<- hip[hip[,8]<5,] ; hip2 = na.omit(hip1) ; hip3 = hip2 hip3[,6] <- hip2[,6] - median(hip2[,6]) hip3[,7] <- hip2[,7] - median(hip2[,7]) attach(hip3) ; dim(hip3) nstar <- length(hip3[,6]) const=360. / (2*pi) theta <- numeric(length=nstar) for (i in 1:nstar) { if(pmRA[i]>=0 & pmDE[i]>=0) theta[i] = atan(pmRA[i] / pmDE[i]) * const if(pmDE[i]<0) theta[i] = 180. + atan(pmRA[i] / pmDE[i]) * const if(pmRA[i]<0 & pmDE[i]>=0) theta[i] = 360. + atan(pmRA[i] / pmDE[i]) * const } hist(theta, breaks=20, lwd=2,xlab='Position angle theta (degrees)', main='') # Proper motion and circular plots install.packages('CircStats') ; library(CircStats) circ.summary(theta) circ.plot(theta, cex=0.3, pch=20, stack=T, bins=50, shrink=2) theta=theta / const plot(pmRA, pmDE, pch=20, cex=0.6, xlab='Proper motion R.A. (mas/yr)', ylab='Proper motion Dec. (mas/yr)',main='') abline(0, 0, lty=2, lwd=2) ; abline(0, 100000, lty=2, lwd=2) est.kappa(theta, bias=T) # Basic statistics, tests for uniformity, and von Mises tests install.packages('circular') ; library(circular) circ.summary(theta) sqrt(circ.disp(theta)[4]) kuiper(theta) ; r.test(theta); rao.spacing(theta) watson(theta) ; watson(theta,dist='vm') vm.ml(theta,bias=T) vm_boot <- vm.bootstrap.ci(theta,bias=T) vm_boot$mu.ci ; vm_boot$kappa.ci ****************************************************************** ****************************************************************** Appendix B. Getting Started with R # I. Query the session environment and make a vector of three numbers getwd() setwd('/Users/myself/newdir') sessionInfo() ; citation() a.test <- c(33, 44, 55) ls() ; class(a.test) ; str(a.test) a.test ; write(file='output', a.test) # II. Create a 500x2 bivariate dataset where the X values are evenly # distributed between 0 and 5 and the Y values have a nonlinear # dependence on X with heteroscedastic Gaussian scatter help(seq) ; help(sample) ; help(rnorm) set.seed(1) x <- sample(seq(0.01, 3, length.out=500)) y <- 0.5*x + 0.3^(x^2) + rnorm(500, mean=0, sd=(0.05*(1+x^2))) # III. Examine, summarize and boxplot univariate distributions summary(x) ; summary(y) str(summary(x)) write.table(rbind(summary(x),summary(y)),file='summary.txt') help(par) ; help(boxplot) par(mfrow=c(1,2)) # set up two-panel figure boxplot(x, notch=T, main='Boxplot for X') boxplot(y, notch=T, pch=20, cex=0.5, main='Boxplot for Y') par(mfrow=c(1,1)) dev.copy2eps(file='box.eps') plot(ecdf(x),pch=20,cex=0.3,main='',ylab='EDF',xlab='') plot(ecdf(y),pch=20,cex=0.3,add=T) text(2.0,0.5,"X") ; text(1.4,0.8,"Y") dev.copy2eps(file='edf.eps') # IV. Put X and Y into a data frame help(cbind) ; help(as.data.frame) ; help(names) xy <- cbind(x, y) ; str(xy) xy <- as.data.frame(xy) names(xy) <- c('Xvar', 'Yvar') ; str(xy) attach(xy) ; ls() # V. Bivariate plots and correlation tests help(plot) par(mfrow=c(2,2)) plot(xy, pch=20, cex=0.5) plot(log10(xy), pch=20, cex=0.5, xlab='log(Xvar)', ylab='log(Yvar)') length(x[x>2.5]) cor.test(x[x>2.5],y[x>2.5], method='pearson') cor.test(x[x>2.5],y[x>2.5], method='kendall') # VI. Kernel density estimator with spline fit smoothScatter(log10(xy), lwd=4, pch=20, cex=0.2, colramp = colorRampPalette(c("white",gray(20:5/20)))) lines(smooth.spline(log10(xy)), lwd=3) # VII. Least-squares polynomial regression logx <- log10(x) ; logx2 <- (log10(x))^2 ; logx3 <- (log10(x))^3 logx4 <- (log10(x))^4 ; logx5 <- (log10(x))^5 yfit <- lm(log10(y) ~ logx + logx2 + logx3 + logx4 + logx5) str(yfit) plot(log10(x), log10(y), pch=20, col=grey(0.5),cex=0.3) lines(sort(log10(x)), yfit$fit[order(log10(x))], lwd=3) dev.copy2eps(file="smooth.eps") ****************************************************************** ****************************************************************** Appendix C. Astronomical Datasets # SDSS point sources test dataset, N=17,000 (mag<21, point sources, hi-qual) SDSS <- read.csv('http://astrostatistics.psu.edu/MSMA/datasets/SDSS_test.csv', h=T) dim(SDSS) ; summary(SDSS) SDSS_test <- data.frame(cbind((SDSS[,1]-SDSS[,2]), (SDSS[,2]-SDSS[,3]), (SDSS[,3]-SDSS[,4]), (SDSS[,4]-SDSS[,5]))) names(SDSS_test) <- c('u_g', 'g_r', 'r_i', 'i_z') str(SDSS_test) par(mfrow=c(1,3)) plot(SDSS_test[,1], SDSS_test[,2], xlim=c(-0.7,3), ylim=c(-0.7,1.8),pch=20, cex=0.6, cex.lab=1.5, cex.axis=1.5, main='', xlab='u-g (mag)', ylab='g-r (mag)') plot(SDSS_test[,2], SDSS_test[,3], xlim=c(-0.7,1.8), ylim=c(-0.7,1.8), pch=20, cex=0.6, cex.lab=1.5, cex.axis=1.5, main='', xlab='g-r (mag)', ylab='r-i (mag)') plot(SDSS_test[,3], SDSS_test[,4], xlim=c(-0.7,1.8), ylim=c(-1.1,1.3), pch=20, cex=0.6, cex.lab=1.5, cex.axis=1.5, main='', xlab='r-i (mag)', ylab='i-z (mag)') par(mfrow=c(1,1)) # Quasar training set, N=2000 (Class 1) qso1 <- read.table('http://astrostatistics.psu.edu/MSMA/datasets/SDSS_QSO.dat', h=T) dim(qso1) ; summary(qso1) bad_phot_qso <- which(qso1[,c(3,5,7,9,11)] > 21.0 | qso1[,3]==0) qso2 <- qso1[1:2000,-bad_phot_qso,] qso3 <- cbind((qso2[,3]-qso2[,5]), (qso2[,5]-qso2[,7]), (qso2[,7]-qso2[,9]), (qso2[,9]-qso2[,11])) qso_train <- data.frame(cbind(qso3, rep(1, length(qso3[,1])))) names(qso_train) <- c('u_g', 'g_r', 'r_i', 'i_z', 'Class') dim(qso_train) ; summary(qso_train) # Star training set, N=5000 (Class 2) temp2 <- read.csv('http://astrostatistics.psu.edu/MSMA/datasets/SDSS_stars.csv', h=T) dim(temp2) ; summary(temp2) star <- cbind((temp2[,1]-temp2[,2]), (temp2[,2]-temp2[,3]), (temp2[,3]-temp2[,4]), (temp2[,4]-temp2[,5])) star_train <- data.frame(cbind(star, rep(2, length(star[,1])))) names(star_train) <- c('u_g','g_r','r_i','i_z','Class') dim(star_train) ; summary(star_train) # White dwarf training set, N=2000 (Class 3) temp3 <- read.csv('http://astrostatistics.psu.edu/MSMA/datasets/SDSS_wd.csv', h=T) dim(temp3) ; summary(temp3) temp3 <- na.omit(temp3) wd <- cbind((temp3[1:2000,2]-temp3[1:2000,3]), (temp3[1:2000,3]-temp3[1:2000,4]), (temp3[1:2000,4]-temp3[1:2000,5]), (temp3[1:2000,5]-temp3[1:2000,6])) wd_train <- data.frame(cbind(wd, rep(3, length(wd[,1])))) names(wd_train) <- c('u_g', 'g_r', 'r_i', 'i_z', 'Class') dim(wd_train) ; summary(wd_train) # Combine and plot the training set (9000 objects) SDSS_train <- data.frame(rbind(qso_train, star_train, wd_train)) names(SDSS_train) <- c('u_g', 'g_r', 'r_i', 'i_z', 'Class') str(SDSS_train) par(mfrow=c(1,3)) plot(SDSS_train[,1], SDSS_train[,2], xlim=c(-0.7,3), ylim=c(-0.7,1.8), pch=20, col=SDSS_train[,5], cex=0.6, cex.lab=1.6, cex.axis=1.6, main='', xlab='u-g (mag)', ylab='g-r (mag)') legend(-0.5, 1.7, c('QSO','MS + RG','WD'), pch=20, col=c('black','red','green'), cex=1.6) plot(SDSS_train[,2], SDSS_train[,3], xlim=c(-0.7,1.8), ylim=c(-0.7,1.8), pch=20, col=SDSS_train[,5], cex=0.6, cex.lab=1.6, cex.axis=1.6, main='', xlab='g-r (mag)', ylab='r-i (mag)') plot(SDSS_train[,3], SDSS_train[,4], xlim=c(-0.7,1.8), ylim=c(-1.1,1.3), pch=20, col=SDSS_train[,5], cex=0.6, cex.lab=1.6, cex.axis=1.6, main='', xlab='r-i (mag)', ylab='i-z (mag)') par(mfrow=c(1,1))