LearningR 　　　　　　　　

01 Aug 2020

R Studio

import dataset with RStudio

parameters_for_analysis_agg <- read_excel("C:/zkk/repetition2/parameters for analysis.xlsx", sheet = "Agg NT-T") //读取Excel文件及特定Sheet

agg_d_seq <- parameters_for_analysis_agg["d_seq"] //选出特定名字的列
parameters_for_analysis_agg$d_seq //使用列中的数值
ntt_fundamental_max <- ntt_nbsfm[c("fundamental_max", "category")]//选出多个列

th_sfm$category <- as.integer(2) //插入新列并赋值
new_frame = cbind(data1, data2['bat']) //将右边的列插入左边的dataframe
colnames(ntt_nbsfm)[16] = 'bandw_max' //改变列名
colnames(ntt_sfm)[c(11, 12, 13, 14, 15, 16, 17, 18, 19, 20)] = c('peak_freq_max', 'peak_amp_max', 'fundamental_max', 'min_freq_max', 'max_freq_max', 'bandw_max', 'quart_25_max', 'quart_50_max', 'quart_75_max', 'entropy_max')

agg_sfm <- subset(parameters_for_analysis_agg, syl=="SFM") //选出特定值的行
data_during <- playback2017data[playback2017data$partition == 'during']//选出特定值的行2
data3 <- rbind(ntt_fundamental_max_del_zero, data.frame(fundamental_max=30200, category=2)) //插入新行

改变某一个值
data_during[1,3] = 100

install.packages(dplyr)
library(dplyr)
data2 = filter(data1, IP_CR_RSK_RTG_ID !='22140023')//选出特定值的行


y <- x[!is.na(x)]//选出非缺失值
ts[is.na(ts)]<-0    #将缺失值替换为0
Individual_consistency[Individual_consistency %in% 1] <- NA 将等于1的值替换为空值

改变特定行的值
th_del$group[th_del$emitter_agg == 'L'] <- 1

使用sql查询data.frame
install.packages('sqldf')
result <- sqldf("select * from dataSet where size='aaa' ")

leveneTest(y, group)

y是你想检测homogeneity of variance 的所有数据， group给的是这些变量的分组信息，所以后面一个变量会被当做factor处理

leveneTest(d_seq ~ as.factor(category), data=ntt_nbsfm)
//显著差异说明方差不齐次

for(i in 1:length(col_name)){ print(col_name[i]); new_frame <- ntt_sfm[c(col_name[i], "category")]; colnames(new_frame)[1] = "variable"; result <- leveneTest(variable~as.factor(category), data=new_frame); print(result$F); print(result$Pr); }

for(i in 1:length(personality_col)){
  print(personality_col[i]);
  new_frame <- data[c(personality_col[i], "bat")];
  colnames(new_frame)[1] = "variable";
  result <- leveneTest(variable~bat, data=new_frame);
  print(result$F);
  print(result$Pr);
}

shapiro.test(ntt_nbsfm$d_seq)// 判断是否正态分布
//显著差异说明不正态
//数据中的0值会转换后出现NaN值,导致无法再检测正态与否

wilcox.test(subset(ntt_nbsfm, category==1)$d_seq, subset(ntt_nbsfm, category==2)$d_seq, exact = FALSE, correct = FALSE)//比较两组不正态的数据是否显著差异

//数据不正态分布而且超过两组数据的时候用kruskal.test(syl_num~category, data=data_frame)

length(ntt_nbsfm) //返回列数
ncol() //返回列数
nrow() //返回行数

formatR package

列索引从1开始

for(i in 1:length(ntt_nbsfm)){
    print(colnames(ntt_nbsfm)[i])
    if(colnames(ntt_nbsfm) !== 'date' && colnames(ntt_nbsfm) !== 'file' && colnames(ntt_nbsfm) !== 'syl'){
    	leveneTest(colnames(ntt_nbsfm)[i]~as.factor(category), data=ntt_nbsfm)
  	}
}
for(i in 1:length(col_name)){ print(col_name[i]); new_frame <- data.frame();new_frame <- ntt_sfm[c(col_name[i], "category")]; colnames(new_frame)[1] = "variable"; result <- leveneTest(variable~as.factor(category), data=new_frame); print(paste(result$F)); #print(paste('Pr: ', result$Pr)); }

list(d_second, fundamental, energy, peak_freq_max, peak_amp_max, fundamental_max, min_freq_max, max_freq_max, bandw_max, quart_25_max, quart_50_max, quart_75_max, entropy_max, interval, category)

bnbl2_col_name <- c("d_seq", "d_first","num_sub", "fundamental", "energy", "peak_freq_max", "peak_ampl_max", "fundamental_max", "min_freq_max", "max_freq_max", "bandw_max", "quart_25_max", "quart_50_max", "quart_75_max", "entropy_max", "interval");
nbsfm2_col_name <- c("d_second", "d_nb", "num_sub", "fundamental", "energy", "peak_freq_max", "peak_ampl_max", "fundamental_max", "min_freq_max", "max_freq_max", "bandw_max", "quart_25_max", "quart_50_max", "quart_75_max", "entropy_max")

leveneTest()

library(car)

for(i in 1:length(nbsfm2_col_name)){ print(nbsfm2_col_name[i]); new_frame <- nnt2_nbsfm[c(nbsfm2_col_name[i], "category")]; colnames(new_frame)[1] = "variable"; result <- leveneTest(variable~as.factor(category), data=new_frame); print(result$F); print(result$Pr); }

shapiro.test()

for(i in 1:length(nbsfm2_col_name)){ print(nbsfm2_col_name[i]); new_frame <- data.frame();new_frame <- nnt2_nbsfm[c(nbsfm2_col_name[i], "category")]; colnames(new_frame)[1] = "variable"; result <- shapiro.test(new_frame$variable); print(result$statistic); print(result$p.value) }

wilcox.test()

for(i in 1:length(nbsfm2_col_name)){ print(nbsfm2_col_name[i]); new_frame <- data.frame();new_frame <- nnt2_nbsfm[c(nbsfm2_col_name[i], "category")]; colnames(new_frame)[1] = "variable"; result <- wilcox.test(subset(new_frame, category==1)$variable, subset(new_frame, category==2)$variable, exact = FALSE, correct = FALSE); print(result$statistic); print(result$p.value) }

aov()

for(i in 1:length(nbsfm2_col_name)){ print(nbsfm2_col_name[i]); new_frame <- data.frame();new_frame <- nnt2_nbsfm[c(nbsfm2_col_name[i], "category")]; colnames(new_frame)[1] = "variable"; result <- aov(variable~as.factor(category), data=new_frame); print(summary(result)) }

ggplot2

geom: geometric object 几何属性, 包括点线条形

aes: aesthetic attributes 图形属性, 包括颜色形状大小 scale

stats: statistical transformation 统计变换

coord: coordinate system 坐标系

facet: facet 分面, 指将绘图窗口划分为若干个子窗口

qplot()

quick plot 快速作图

默认是散点图
qplot(x, y, data = dataSet, colour=column_name, shape=column_name, alpha=I(1/10), geom="smooth|boxplot|path|line|histogram|freqpoly|density|bar|jitter")
//散点图中点的颜色和形状,透明度为原来的1/10

size: 调节线条粗细
geom=c("boxplot", "smooth")//可以组合使用

ggplot()

library(ggplot2)
// 去除灰色背景、网格线、图例、刻度
+theme_bw()+theme(
	panel.grid.major = element_line(colour = NA),
	panel.grid.minor = element_line(colour = NA),
	legend.position = 'none',
	axis.text.x = element_blank()
)
//去除刻度线
p + theme(axis.ticks.y = element_blank(), axis.text.y = element_blank())

//默认的两个颜色
"#669933", "#FFCC66"

//两组柱状图
ggplot(syllable, aes(fill=Context, Syllable_type, Percent)) + geom_bar(position = 'dodge', stat = 'identity')

//按照fill=的字段分组
ggplot(data_during_nbdfm_del_zero, aes(fill=behaviors, playback_file, number)) + geom_boxplot()

//按照facet_grid()中的字段分面
ggplot(data_during_nbdfm_del_zero, aes(fill=behaviors, playback_file, number)) + geom_boxplot() + facet_grid(. ~ behaviors)

可以将数据取log值,纵坐标差距比较大的数据做出来的图更好看一些

//将图竖起来 翻转90度
+coord_flip()

// agg th potision
> position_data$syllable <- with(position_data, reorder(syllable,number))

> ggplot(position_agg_th_8_nt)+geom_bar(aes(x=syllable, y=number/sum(number),fill=context),stat = 'identity')+facet_wrap(context~position)+theme(axis.text.x = element_text(angle = 90), legend.position='none')+labs(x='Syllable type', y='Percent')

// agg th syllable
> ggplot(syl)+geom_bar(aes(x=Syllable_type, y=Percent, fill=Context), stat = 'identity',position = 'dodge')+scale_x_discrete(limits=c('SNU','SNSNS','NSNSND','SNSND','NSN', 'DFM','SN','AFM','BNBs','NB-UFM','UFM','NSNS','SNS','NSND','SND','BNBl','SFM','NB-DFM','NB-SFM'))+labs(x='Syllable type')+theme(legend.position = c(.7,.9), axis.text.x = element_text(angle = 90))

> nt_syllable$syllable <- with(nt_syllable, reorder(syllable,number))
> ggplot(nt_syllable)+geom_bar(aes(x=syllable, y=number, fill=context), stat = 'identity',position = 'dodge')+labs(x='Syllable type', y='Percent')+theme(legend.position = c(.7,.9), axis.text.x = element_text(angle = 90))

判别分析 LDA

# 数据准备，使用R内置数据集iris
# 通过抽样建立训练样本(70%)和测试样本(30%)
> index <- sample(2,size = nrow(iris),replace = TRUE, prob = c(0.7,0.3))
> train_data <- iris[index == 1,]
> test_data <- iris[index == 2,]
# 载入所用包
> library(MASS)
# 构建Fisher判别模型
> fisher_model <- lda(Species~., data = train_data)
# 进行预测
> fisher_model_pre <- predict(fisher_model, newdata = test_data[,1:4])
# 生成实际与预判交叉表
> table(test_data$Species,fisher_model_pre$class)

             setosa versicolor virginica
  setosa         20          0         0
  versicolor      0         14         1
  virginica       0          0        18
# 生成预判精度
> sum(diag(table(test_data$Species,fisher_model_pre$class)))
+ / sum(table(test_data$Species,fisher_model_pre$class))
[1] 0.9811321

agg_model <- lda(emitter~., data=Individual_consistency, prior=c(1,1,1,1,1,1,1,1)/8)
table(emitter, predict(agg_model)$class)

#作图
plot(agg_model)

ggplot(syllable[syllable$Context=='Aggressive']) + aes(x=Syllable_type, y=Percent) + geom_bar(position = 'dodge', stat = 'identity') + ggplot()

my_pal = c("#EE0000FF", "#008B45FF", "#631879FF", "#1B1919FF", "#197EC0FF", "#F05C3BFF", "#E762D7FF", "#82491EFF", "#EFC000FF", "#868686FF")

my_pal = c("#FF0000FF", "#FF9900FF", "#FFCC00FF", "#00FF00FF", "#6699FFFF", "#99991EFF", "#9900CCFF", "#FF00CCFF", "#CCFFFFFF", "#FFCCCCFF", "#FFFF00FF", "#CCFF00FF", "#358000FF", "#0000CCFF", "#99CCFFFF", "#00FFFFFF", "#666600FF", "#CC99FFFF", "#666666FF", "#79CC3DFF", "#CCCC99FF", "#5050FFFF", "#CC33FFFF", "#CCCCCCFF", "#999999FF", "#CC0000FF", "#996600FF")

my_pal[c(1, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 18, 19, 20, 21, 22, 23, 24, 25)]

27 colors line

ggplot(position_data_agg_3d, aes(x=position_code, y=log(percent), fill=syllable, color=syllable))+geom_line(size=1)+theme_classic()+scale_color_manual(values=my_pal)

 ggplot(agg_position_data, aes(x=position_code, y=number, fill=syllable, color=syllable)) +geom_line(size=1) +scale_color_manual(values=my_pal) + theme_classic() +geom_point() + scale_x_continuous(breaks=seq(1, 3, 1))

 ggplot(distress_position_data, aes(x=position_code, y=number, fill=syllable, color=syllable)) +geom_line(size=1) +scale_color_manual(values=my_pal[c(1, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 18, 19, 20, 21, 22, 23, 24, 25)]) + theme_classic() +geom_point() + scale_x_continuous(breaks=seq(1, 4, 1)) + theme(legend.position = 'none')

改变图例顺序

ggplot(seq_type_data, aes(x=population, y=number, fill=type))+ geom_bar(stat='identity', position = 'fill') + scale_x_discrete(limits=c('BJ', 'LT', 'JA')) + scale_fill_manual(values = my_pal, labels=c('NT', 'T', 'N', 'N/NT', 'NT/T', 'NT/N', 'T/NT', 'N/T', 'T/NT/T', 'NT/T/NT', 'NT/N/NT', 'N/NT/T', 'T/NT/N/NT', 'N/NT/N/NT', 'NT/T/NT/T'), breaks = c('NT', 'T', 'N', 'N/NT', 'NT/T', 'NT/N', 'T/NT', 'N/T', 'T/NT/T', 'NT/T/NT', 'NT/N/NT', 'N/NT/T', 'T/NT/N/NT', 'N/NT/N/NT', 'NT/T/NT/T'))

改变Y轴堆叠顺序

seq_type_data$orderby = factor(seq_type_data$type, levels = c('NT', 'T', 'N', 'N/NT', 'NT/T', 'NT/N', 'T/NT', 'N/T', 'T/NT/T', 'NT/T/NT', 'NT/N/NT', 'N/NT/T', 'T/NT/N/NT', 'N/NT/N/NT', 'NT/T/NT/T'))
ggplot(seq_type_data, aes(x=population, y=number, fill=orderby))+ geom_bar(stat='identity', position = 'fill') + scale_x_discrete(limits=c('BJ', 'LT', 'JA')) + scale_fill_manual(values = my_pal)

宽数据变长数据

library(reshape2)

comm_fm_reshape <- melt(comm_fm, id.vars = "type", variable.name = "parameter", value.name = "count")

http://offline.nenu.edu.cn/DeleteOnline/DeleteServlet?device_name=%25E6%2588%2591%25E7%259A%2584%25E8%25AE%25BE%25E5%25A4%2587&&device_tmp=B0FC36192D55

ggplot(dati_echo_reshape, aes(x=type, y=count, color=type))+geom_boxplot()+facet_wrap(~parameter, scales="free_y", nrow = 1)+scale_color_manual(values=c("red", "blue"))+theme_bw()+theme(legend.position = 'none',axis.text.x = element_blank())

设置分面的排列顺序

dati_echo_lm$parameter <- factor(dati_echo_lm$parameter, levels = c('duration', 'Fstart', 'Fend', 'peak_freq', 'min_freq', 'max_freq', 'bandw'))

ggplot(dati_echo_lm, aes(x=original, y=extracted))+geom_point(shape=1)+geom_smooth(method = lm)+facet_wrap(~parameter, scales = "free", nrow = 1)+theme(axis.text.x = element_text(angle = 90))

MRPP分析比较组内差异与组间差异

library("vegan")
taep_new_dist <- vegdist(subset(subset(taep_new, select = -emitter), select=-population))
m <- monoMDS(taep_new_dist)
View(m)
plot(m$points)
taep_new_mrpp <- mrpp(taep_new_dist, grouping = taep_new$population, permutations = 999)
View(taep_new_mrpp)
taep_new_mrpp

Call:

mrpp(dat = taep_new_dist, grouping = taep_new$population, permutations = 999)

Dissimilarity index: bray

Weights for groups: n

Class means and counts:

   BJ     JA     LT

delta 0.5067 0.5044 0.567

n 11 8 12

Chance corrected within-group agreement A: 0.1201

Based on observed delta 0.5295 and expected delta 0.6017

Significance of delta: 0.001

Permutation: free

Number of permutations: 999

得到的结果中A值大于P值，即 0.1201>0.001，表明组间差异大于组内差异

ANOSIM分析比较组内差异与组间差异

taep_new_anosim <- anosim(taep_new_dist, taep_new$population, permutations = 999)
taep_new_anosim

作图：plot(taep_new_anosim)