R语言之dplyr 数据清理

共计 4238 个字符，预计需要花费 11 分钟才能阅读完成。

## Chapter0 dplyr 介绍

dplyr 是一款用于数据整理的 R 包

```{r Load dplyr package}
# Load dplyr, installing it first when it is not available yet.
# requireNamespace() only checks whether the package can be loaded;
# library() then attaches it and stops with an error if the installation
# did not succeed (require() would merely return FALSE and carry on).
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}
library(dplyr)
```

## Chapter1 数据导入

介绍如何将文本格式的数据导入到 R 的内存，主要介绍 read.table 和 read.csv 函数，及常用参数的使用

### read.table

```r
read.table(
  ## file path
  file,
  ## use the first line as the header (column names)
  header = FALSE,
  ## field separator string
  sep = "",
  ## how many rows to read (-1 reads all rows)
  nrows = -1,
  ## how many lines to skip before reading starts
  skip = 0,
  ## fill in the missing fields of short rows (shown as NA)
  fill = !blank.lines.skip)
```

```{r read.table instruction}
# Example: read a tab-separated file with the default settings.
# Without header = TRUE the first line is read as data and the columns
# are named V1, V2, V3.
read.table(file = "dplyr-data/read.table/file1.txt")
#     V1  V2     V3
# 1 name age height
# 2 John  10    150
# 3 Jack  27    180
# 4 Mary  29    167

# header = TRUE uses the first line as the column names.
read.table(file = "dplyr-data/read.table/file1.txt",
           header = TRUE)
#   name age height
# 1 John  10    150
# 2 Jack  27    180
# 3 Mary  29    167

# Keep the result in a variable and print it.
file1.data <- read.table(file = "dplyr-data/read.table/file1.txt",
                         header = TRUE)
file1.data

# sep parameter
# With a single space as the separator the tab-separated fields are not
# split, so every line ends up in one column.
read.table(file = "dplyr-data/read.table/file1.txt",
           header = TRUE,
           sep = " ")
#   name.age.height
# 1   John\t10\t150
# 2   Jack\t27\t180
# 3   Mary\t29\t167

# tab and new line: "\t" is a tab, "\n" starts a new line
cat("\t\t\t1")
# prints three tabs followed by 1
cat("\n\n\t1")
# prints two new lines, one tab, then 1

# sep = "\t" splits the fields on tabs.
read.table(file = "dplyr-data/read.table/file1.txt",
           header = TRUE,
           sep = "\t")
#   name age height
# 1 John  10    150
# 2 Jack  27    180
# 3 Mary  29    167

# sep is a comma
read.table(file = "dplyr-data/read.table/file2.txt",
           header = TRUE,
           sep = ",")
#   name age height
# 1 John  10    150
# 2 Jack  27    180
# 3 Mary  29    167

# skip & fill parameters
# file3.txt has a row with fewer than 3 fields, so the next two calls fail.
# They are wrapped in try() so that the error message is still shown but the
# rest of the chunk keeps running.
try(read.table(file = "dplyr-data/read.table/file3.txt",
               header = TRUE))
# Error in scan(file = file, what = what, sep = sep, quote = quote, dec = dec, :
#   line 4 did not have 3 elements

try(read.table(file = "dplyr-data/read.table/file3.txt",
               header = TRUE,
               nrows = 4))
# Error in scan(file = file, what = what, sep = sep, quote = quote, dec = dec, :
#   line 4 did not have 3 elements

# Reading only the first 3 data rows stops before the incomplete row.
read.table(file = "dplyr-data/read.table/file3.txt",
           header = TRUE,
           nrows = 3)
#   name age height
# 1 John  10    150
# 2 Jack  27    180
# 3 Mary  29    167

# But one row is still missing, so the skip parameter is used as follows:
# skip the first 5 lines and read the rest without a header.
read.table(file = "dplyr-data/read.table/file3.txt",
           header = FALSE,
           skip = 5)
#       V1 V2  V3
# 1 Steven 45 175

# fill = TRUE fills the missing values with NA instead of failing.
read.table(file = "dplyr-data/read.table/file3.txt",
           header = TRUE,
           fill = TRUE)
#     name age height
# 1   John  10    150
# 2   Jack  27    180
# 3   Mary  29    167
# 4    DDD  NA     NA
# 5 Steven  45    175
```

### read.csv

```{r import order data by read.csv}
# Import the order data and convert it to a tbl object (tibble).
order <- read.csv("dplyr-data/order.csv")
head(order, 5)

# as_tibble() replaces tbl_df(), which is deprecated in current dplyr.
order_tbl <- as_tibble(order)
head(order_tbl, 5)
```

## Chapter2 tbl 对象的介绍

本节内容介绍如何用 data.frame 对象构造 tbl 对象

tbl 是 dplyr 定义的数据类型，可以接受 data.frame,cube,sql

```{r import data by read.table}
# Import the order data as a plain data.frame.
order <- read.table(file = "dplyr-data/order.csv",
                    header = TRUE,
                    sep = ",")
class(order)
# [1] "data.frame"
head(order)
tail(order)

# Load dplyr, installing it first when it is not available yet.
# library() stops with an error if the package still cannot be attached.
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}
library(dplyr)

# Convert the data.frame to a tbl object.
# as_tibble() replaces tbl_df(), which is deprecated in current dplyr.
order_tbl <- as_tibble(order)
class(order_tbl)
# [1] "tbl_df"     "tbl"        "data.frame"
order_tbl

# For a data.frame it is not necessary to convert to a tbl object first;
# for SQL sources it is useful to manipulate the data through dplyr.

# Another way to read the data: readr::read_csv().
library(readr)
# The original used an absolute path on the author's machine
# (E:/03-Download/dplyr/...); the project-relative path used above is
# presumably the same file and works on any machine.
order <- read_csv("dplyr-data/order.csv")
# View() opens a data viewer, so only call it in an interactive session.
if (interactive()) {
  View(order)
}
```

## Chapter3 数据筛选 –filter 函数

`filter(tbl/data.frame, condition)` keeps the rows (observations) that satisfy the condition and outputs a data.frame.

```{r filter in dplyr}
# Load dplyr, installing it first when it is not available yet.
# library() stops with an error if the package still cannot be attached.
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}
library(dplyr)

# Small example table.
df <- data.frame(
  color = c("blue", "black", "blue", "blue", "black"),
  value = 1:5
)
# as_tibble() replaces tbl_df(), which is deprecated in current dplyr.
tbl <- as_tibble(df)
tbl
# NOTE: the outputs below appear to have been recorded with R < 4.0, where
# data.frame() converted strings to factors; on R >= 4.0 the color column
# is a character column instead.
# # A tibble: 5 × 2
#   color value
#   <fctr> <int>
# 1 blue      1
# 2 black     2
# 3 blue      3
# 4 blue      4
# 5 black     5

# Keep the rows whose value matches a condition.
filter(tbl, color == "blue")
# # A tibble: 3 × 2
#   color value
#   <fctr> <int>
# 1 blue      1
# 2 blue      3
# 3 blue      4

# Keep the rows whose value is 1 or 4.
filter(tbl, value %in% c(1, 4))
# # A tibble: 2 × 2
#   color value
#   <fctr> <int>
# 1 blue      1
# 2 blue      4

# Import the order data and convert it to a tbl object.
order <- read.csv("dplyr-data/order.csv")
head(order, 5)
order_tbl <- as_tibble(order)
head(order_tbl, 5)

# Keep the orders whose order date is 2009-10-13.
filter(order_tbl, orderdate == "2009-10-13")

# Keep the orders from 2009-10-13 whose total price is greater than 100
# (1 row matches) and assign the result to filterData.
filterData <- filter(order_tbl, orderdate == "2009-10-13" & totalprice > 100)
# View() opens a data viewer, so only call it in an interactive session.
if (interactive()) {
  View(filterData)
}
```

## Chapter4 子集选取函数 –select

`select()` picks columns (variables) by name or by matching rules.

“`{r select function in dplyr}

# Load dplyr, installing it first when it is not available yet.
# library() stops with an error if the package still cannot be attached.
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}
library(dplyr)

# Small example table.
df <- data.frame(
  color = c("blue", "black", "blue", "blue", "black"),
  value = 1:5
)
# as_tibble() replaces tbl_df(), which is deprecated in current dplyr.
tbl <- as_tibble(df)
tbl

# Select only the color column.
# NOTE: the outputs below appear to have been recorded with R < 4.0, where
# data.frame() converted strings to factors; on R >= 4.0 the color column
# is a character column instead.
select(tbl, color)
# # A tibble: 5 × 1
#   color
#   <fctr>
# 1 blue
# 2 black
# 3 blue
# 4 blue
# 5 black

# Select every column except color, which here equals selecting value.
select(tbl, -color)
# # A tibble: 5 × 1
#   value
#   <int>
# 1     1

正文完