共计 4238 个字符,预计需要花费 11 分钟才能阅读完成。
## Chapter0 dplyr 介绍
dplyr 是一款用于数据整理的 R 包
```{r Load dplyr package}
# Attach dplyr, installing it first when it is not available yet.
# requireNamespace() only checks availability; library() then fails loudly
# if the package still cannot be loaded (require() would just return FALSE).
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}
library(dplyr)
```
## Chapter1 数据导入
介绍如何将文本格式的数据导入到 R 的内存,主要介绍 read.table 和 read.csv 函数,及常用参数的使用
### read.table
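```r
read.table(
  ## file path
  file,
  ## treat the 1st line as the header (column names)
  header = FALSE,
  ## field separator; "" (the default) means any whitespace
  sep = "",
  ## maximum number of rows to read (-1 = read all rows)
  nrows = -1,
  ## number of lines to skip before reading data
  skip = 0,
  ## if TRUE, rows with fewer fields are padded with NA
  fill = !blank.lines.skip)
```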
```{r read.table instruction, error=TRUE}
# error=TRUE: two calls below fail on purpose; this option lets the chunk
# print their error messages instead of aborting the knit.

# example: without header = TRUE the first line is read as data
read.table(file = "dplyr-data/read.table/file1.txt")
# V1 V2 V3
# 1 name age height
# 2 John 10 150
# 3 Jack 27 180
# 4 Mary 29 167

# header = TRUE uses the first line as column names
read.table(file = "dplyr-data/read.table/file1.txt",
           header = TRUE)
# name age height
# 1 John 10 150
# 2 Jack 27 180
# 3 Mary 29 167

# keep the result in a variable
file1.data <- read.table(file = "dplyr-data/read.table/file1.txt",
                         header = TRUE)
file1.data

# sep parameter: file1.txt is tab-separated, so with a single space as the
# separator every line ends up in one column (tabs are printed as \t)
read.table(file = "dplyr-data/read.table/file1.txt",
           header = TRUE,
           sep = " ")
# name.age.height
# 1 John\t10\t150
# 2 Jack\t27\t180
# 3 Mary\t29\t167

# "\t" is a tab and "\n" is a new line
cat("\t\t\t1")
#                         1
cat("\n\n\t1")
#
#
#         1

# sep is a tab
read.table(file = "dplyr-data/read.table/file1.txt",
           header = TRUE,
           sep = "\t")
# name age height
# 1 John 10 150
# 2 Jack 27 180
# 3 Mary 29 167

# sep is a comma
read.table(file = "dplyr-data/read.table/file2.txt",
           header = TRUE,
           sep = ",")
# name age height
# 1 John 10 150
# 2 Jack 27 180
# 3 Mary 29 167

# nrows, skip & fill parameters: file3.txt contains an incomplete row
read.table(file = "dplyr-data/read.table/file3.txt",
           header = TRUE)
# Error in scan(file = file, what = what, sep = sep, quote = quote, dec = dec, :
# line 4 did not have 3 elements

# reading 4 rows still includes the incomplete row, so it fails as well
read.table(file = "dplyr-data/read.table/file3.txt",
           header = TRUE,
           nrows = 4)
# Error in scan(file = file, what = what, sep = sep, quote = quote, dec = dec, :
# line 4 did not have 3 elements

# reading only the 3 rows before the incomplete one works
read.table(file = "dplyr-data/read.table/file3.txt",
           header = TRUE,
           nrows = 3)
# name age height
# 1 John 10 150
# 2 Jack 27 180
# 3 Mary 29 167

# but the row after the incomplete one is still missing, so use skip:
# skip the first 5 lines and read the rest without a header
read.table(file = "dplyr-data/read.table/file3.txt",
           header = FALSE,
           skip = 5)
# V1 V2 V3
# 1 Steven 45 175

# fill = TRUE pads the missing values of the incomplete row with NA
read.table(file = "dplyr-data/read.table/file3.txt",
           header = TRUE,
           fill = TRUE)
# name age height
# 1 John 10 150
# 2 Jack 27 180
# 3 Mary 29 167
# 4 DDD NA NA
# 5 Steven 45 175
```
### read.csv
```{r import order data by read.csv}
# import the order data and convert it to a tbl (tibble) object;
# as_tibble() replaces tbl_df(), which is deprecated since dplyr 1.0.0
order <- read.csv("dplyr-data/order.csv")
head(order, 5)
order_tbl <- as_tibble(order)
head(order_tbl, 5)
```
## Chapter2 tbl 对象的介绍
本节内容介绍如何用 data.frame 对象构造 tbl 对象
tbl 是 dplyr 定义的数据类型,可以接受 data.frame,cube,sql
```{r import data by read.table}
# import data
order <- read.table(file = "dplyr-data/order.csv",
                    header = TRUE,
                    sep = ",")
class(order)
# [1] "data.frame"
head(order)
tail(order)

# Attach dplyr, installing it first when it is not available yet
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}
library(dplyr)

# convert to a tbl with as_tibble() (tbl_df() is deprecated since dplyr 1.0.0)
order_tbl <- as_tibble(order)
class(order_tbl)
# [1] "tbl_df" "tbl" "data.frame"
order_tbl
# for a data.frame it is not necessary to convert to a tbl object;
# for a SQL source, dplyr is a convenient way to manipulate the data

# another way to read data: readr::read_csv()
# (relative path, consistent with the other examples, instead of a
# machine-specific absolute path)
library(readr)
order <- read_csv("dplyr-data/order.csv")
# View() needs an interactive session, so skip it when knitting
if (interactive()) View(order)
```
## Chapter3 数据筛选 –filter 函数
`filter(tbl/data.frame, condition)` keeps the rows (observations) that satisfy the condition and returns a data frame.
```{r filter in dplyr}
# Attach dplyr, installing it first when it is not available yet
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}
library(dplyr)

# small example table
df <- data.frame(
  color = c("blue", "black", "blue", "blue", "black"),
  value = 1:5
)
# as_tibble() replaces tbl_df(), which is deprecated since dplyr 1.0.0
tbl <- as_tibble(df)
tbl
# (output below was recorded on R < 4.0; since R 4.0 data.frame() keeps
# strings as character, so the color column prints as <chr>)
# # A tibble: 5 × 2
# color value
# <fctr> <int>
# 1 blue 1
# 2 black 2
# 3 blue 3
# 4 blue 4
# 5 black 5

# keep the rows whose color is "blue"
filter(tbl, color == "blue")
# # A tibble: 3 × 2
# color value
# <fctr> <int>
# 1 blue 1
# 2 blue 3
# 3 blue 4

# keep the rows whose value is 1 or 4
filter(tbl, value %in% c(1, 4))
# # A tibble: 2 × 2
# color value
# <fctr> <int>
# 1 blue 1
# 2 blue 4

# import the order data and convert it to a tbl object
order <- read.csv("dplyr-data/order.csv")
head(order, 5)
order_tbl <- as_tibble(order)
head(order_tbl, 5)

# keep the rows whose order date is 2009-10-13
filter(order_tbl, orderdate == "2009-10-13")

# keep the rows whose order date is 2009-10-13 and whose total price is
# greater than 100 (1 row matches) and assign the result to filterData
filterData <- filter(order_tbl, orderdate == "2009-10-13" & totalprice > 100)
# View() needs an interactive session, so skip it when knitting
if (interactive()) View(filterData)
```
## Chapter4 子集选取函数 –select
`select()` picks columns (variables) by name or by matching rules.
```{r select function in dplyr}
# Attach dplyr, installing it first when it is not available yet
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}
library(dplyr)

# small example table
df <- data.frame(
  color = c("blue", "black", "blue", "blue", "black"),
  value = 1:5
)
# as_tibble() replaces tbl_df(), which is deprecated since dplyr 1.0.0
tbl <- as_tibble(df)
tbl

# select only the color column
# (output was recorded on R < 4.0; since R 4.0 the column prints as <chr>)
select(tbl, color)
# A tibble: 5 × 1
# color
# <fctr>
# 1 blue
# 2 black
# 3 blue
# 4 blue
# 5 black

# select every column except color, i.e. only the value column here
select(tbl, -color)
# A tibble: 5 × 1
# value
# <int>
# 1 1
# 2 2
# 3 3
# 4 4
# 5 5
```
正文完