赞
踩
# Install arrow only when it is missing, so re-running the script does not
# reinstall the package every time.
if (!requireNamespace("arrow", quietly = TRUE)) {
  install.packages("arrow")
}
library(arrow)

# Small example data frame with a numeric, a character and a logical column.
data <- data.frame(
  column1 = c(1, 2, 3),
  column2 = c("a", "b", "c"),
  column3 = c(TRUE, FALSE, TRUE)
)

# Round-trip the data frame through a Parquet file.
# NOTE(review): the path is machine-specific; adjust before running elsewhere.
parquet_path <- "C:/Users/86133/Desktop/RData/data1.parquet"
arrow::write_parquet(data, parquet_path)
data <- arrow::read_parquet(parquet_path)
arrow库还提供了一些函数来处理arrow格式的数据。您可以使用以下函数对数据进行操作:
# Ask arrow for the schema of `data` and print it.
arrow::schema(data)
这将打印数据结构的元数据。
# select() is a dplyr verb; arrow does not export a function of that name,
# so call it through the dplyr namespace.
dplyr::select(data, c("column1", "column2"))
这将选择指定列,并返回新的数据帧。
# filter() is a dplyr verb, not an arrow export, and the predicate must name
# a column that exists in `data` (column1 holds 1, 2, 3).
dplyr::filter(data, column1 > 1)
这将根据指定条件过滤数据,并返回新的数据帧。
# arrow has no sort() export; order rows with dplyr::arrange() on a column
# that exists in `data`.
dplyr::arrange(data, column1)

# Convert a built-in data frame to an Arrow Table and back again.
air_table <- arrow_table(airquality)
air_df <- as.data.frame(air_table)

# Create table
my_table <- arrow_table(
  data.frame(
    group = c("A", "B", "C"),
    score = c(99, 97, 99)
  )
)

# Write to Parquet, then read it back with the default arguments
write_parquet(my_table, "my_table.parquet")
parquet_tb1 <- read_parquet("my_table.parquet")
由于参数 as_data_frame 保留为其默认值 TRUE,因此文件被读入为 data.frame 对象。
# Check which R class read_parquet() returned with its default arguments.
class(parquet_tb1)
[1] "data.frame"
# Read the same file again, this time with as_data_frame = FALSE, and check
# the class of the returned object.
my_table_arrow <- read_parquet(
  "my_table.parquet",
  as_data_frame = FALSE
)
class(my_table_arrow)
[1] "Table" "ArrowTabular" "ArrowObject" "R6"
# Create table to read back in
dist_time <- arrow_table(
  data.frame(
    distance = c(12.2, 15.7, 14.2),
    time = c(43, 44, 40)
  )
)

# Write to Parquet
write_parquet(dist_time, "dist_time.parquet")

# Read in only the "time" column, then print the result.
# The assignment and the print must be separate statements; fused on one
# line they do not parse.
time_only <- read_parquet("dist_time.parquet", col_select = "time")
time_only
# Build a small Arrow Table and round-trip it through a Feather file.
my_table <- arrow_table(
  data.frame(
    group = c("A", "B", "C"),
    score = c(99, 97, 99)
  )
)
write_feather(my_table, "my_table.arrow")
my_feather_tbl <- read_feather("my_table.arrow")
# Write an Arrow Table in the IPC stream format and read it back.
# (Console prompts from the pasted session are removed so the code parses.)
my_table <- arrow_table(
  data.frame(
    group = c("A", "B", "C"),
    score = c(99, 97, 99)
  )
)
write_ipc_stream(my_table, "my_table.arrows")
my_ipc_stream <- arrow::read_ipc_stream("my_table.arrows")

# Write the built-in `cars` data to CSV with arrow, then read it back with
# as_data_frame = FALSE.
write_csv_arrow(cars, "cars.csv")
my_csv <- read_csv_arrow("cars.csv", as_data_frame = FALSE)
# Create a file to read back in: newline-delimited JSON, one object per line
tf <- tempfile()
writeLines('
{"country": "United Kingdom", "code": "GB", "long": -3.44, "lat": 55.38}
{"country": "France", "code": "FR", "long": 2.21, "lat": 46.23}
{"country": "Germany", "code": "DE", "long": 10.45, "lat": 51.17}
', tf, useBytes = TRUE)

# Read in the data, keeping only three of the four fields, and print it.
# The file is read once; the repeated identical call was redundant.
countries <- read_json_arrow(
  tf,
  col_select = c("country", "long", "lat")
)
countries
将数据以单个 Parquet 文件的形式写入磁盘。
# Write the built-in airquality data as a dataset under "airquality_data".
write_dataset(
  dataset = airquality,
  path = "airquality_data"
)
根据数据中的列进行分区,将多个 Parquet 数据文件保存到磁盘。
# Write airquality again, partitioned by the Month column.
write_dataset(
  dataset = airquality,
  path = "airquality_partitioned",
  partitioning = "Month"
)
这会根据提供的分区变量 Month 创建对应的文件夹。
# List the partition directories. String literals need straight quotes;
# the typographic quotes in the original do not parse.
list.files("airquality_partitioned")
[1] "Month=5" "Month=6" "Month=7" "Month=8" "Month=9"
将分区数据文件作为 Arrow 数据集(Dataset)读取。
# Open the partitioned dataset written to "airquality_partitioned".
# The original opened "airquality_partitioned_deeper", a directory this
# script never creates.
air_data <- open_dataset("airquality_partitioned")

# Write the data as a Feather dataset
write_dataset(
  dataset = airquality,
  path = "airquality_data_feather",
  format = "feather"
)

# write Arrow file to use in this example
write_dataset(
  dataset = airquality,
  path = "airquality_data_arrow",
  format = "arrow"
)
# read into R
open_dataset("airquality_data_arrow", format = "arrow")

# write CSV file to use in this example
write_dataset(
  dataset = airquality,
  path = "airquality_data_csv",
  format = "csv"
)
# read into R
open_dataset("airquality_data_csv", format = "csv")
读取包含没有标头的 CSV 的数据集
# write CSV files to use in this example: two headerless parts holding
# rows 1-40 and 41-80 of three airquality columns
dataset_1 <- airquality[1:40, c("Month", "Day", "Temp")]
dataset_2 <- airquality[41:80, c("Month", "Day", "Temp")]

# showWarnings = FALSE keeps re-runs quiet when the directory already exists
dir.create("airquality", showWarnings = FALSE)
write.table(
  dataset_1, "airquality/part-1.csv",
  sep = ",", row.names = FALSE, col.names = FALSE
)
write.table(
  dataset_2, "airquality/part-2.csv",
  sep = ",", row.names = FALSE, col.names = FALSE
)

# read into R, supplying the column names the files do not carry
open_dataset(
  "airquality",
  format = "csv",
  column_names = c("Month", "Day", "Temp")
)
如果数据集由无标头的 CSV 文件组成,则必须提供每一列的名称。您可以通过多种方式执行此操作:通过 column_names 参数(如上所示),或通过 schema 参数:
# Same headerless CSV dataset, but naming and typing the columns through a
# schema. Straight quotes replace the typographic ones, which do not parse.
open_dataset(
  "airquality",
  format = "csv",
  schema = schema(
    "Month" = int32(),
    "Day" = int32(),
    "Temp" = int32()
  )
)
将 R 中的现有向量转换为 Arrow 数组(Array)对象。
# Build an Arrow Array from an R numeric vector.
# (Console prompts from the pasted session are removed so the code parses.)
score <- c(97.99, 86)
score_array <- Array$create(score)
将 R 中的现有数据框转换为箭头表对象
# Create a tibble
my_tibble <- tibble::tibble(
  group = c("A", "B", "C"),
  score = c(99, 45, 89)
)
# Convert the tibble to an Arrow Table
my_table <- arrow_table(my_tibble)
# Collect the Arrow Table back into R and print it
dplyr::collect(my_table)
将 R 中的现有数据框转换为箭头记录批处理对象
# Create a tibble
my_tibble <- tibble::tibble(
  group = c("A", "B", "C"),
  score = c(99, 45, 89)
)
# Convert the tibble to an Arrow RecordBatch
my_record <- record_batch(my_tibble)

# Create an Array from the integers 1 to 5
integer_arr <- Array$create(1:5)
# Cast the Array to the uint8 type
uint_arr <- integer_arr$cast(target_type = uint8())
更改现有箭头表中一个或多个字段的类型。
# Create a data frame
oscars <- tibble::tibble(
  actor = c("Katharine Hepburn", "Meryl Streep", "Jack Nicholson"),
  num_awards = c(4, 3, 3)
)
# Convert it to an Arrow Table
oscars_arrow <- arrow_table(oscars)
# Target types for the fields
oscars_schema <- schema(
  actor = string(),
  num_awards = int16()
)
# Cast the existing Table to the target schema
oscars_arrow_int <- oscars_arrow$cast(target_schema = oscars_schema)

# Same data again, this time applying the schema while creating the Table
oscars <- tibble::tibble(
  actor = c("Katharine Hepburn", "Meryl Streep", "Jack Nicholson"),
  num_awards = c(4, 3, 3)
)
oscars_schema <- schema(
  actor = string(),
  num_awards = int16()
)
# NOTE(review): `scars_data_arrow` looks like a typo for `oscars_data_arrow`;
# the name is kept in case code outside this view refers to it.
scars_data_arrow <- arrow_table(oscars, schema = oscars_schema)
读取文件时手动指定箭头数据类型。
# Create a data frame
oscars <- tibble::tibble(
  actor = c("Katharine Hepburn", "Meryl Streep", "Jack Nicholson"),
  num_awards = c(4, 3, 3)
)
# Write it to disk as a dataset under "oscars_data"
write_dataset(oscars, path = "oscars_data")
# Column types to apply when the dataset is opened
oscars_schema <- schema(
  actor = string(),
  num_awards = int16()
)
# Open the dataset, passing the schema so the column types are set explicitly
oscars_dataset_arrow <- open_dataset(
  "oscars_data",
  schema = oscars_schema
)
在 Array 中搜索与谓词条件匹配的值。
# Subset an Array with a predicate: keep the values greater than 3
my_values <- Array$create(c(1:5, NA))
my_values[my_values > 3]

# Mean of the Array with missing values removed
my_values <- Array$create(c(1:5, NA))
mean(my_values, na.rm = TRUE)

# Count how often each distinct value occurs
repeated_vals <- Array$create(
  c(1, 1, 2, 3, 3, 3, 3, 3)
)
value_counts(repeated_vals)
在 Array 对象上使用各种算术运算符。
# Arithmetic operators work on an Array directly
num_array <- Array$create(1:10)
num_array + 10

# Call the "variance" compute function on 1..100 with the delta degrees of
# freedom (ddof) option set to 0
first_100_numbers <- Array$create(1:100)
call_function(
  "variance",
  first_100_numbers,
  options = list(ddof = 0)
)
将 Arrow 与 dplyr 语法结合使用。
# dplyr supplies %>%, the verbs used below and the starwars data, so it must
# be attached before the first pipeline rather than after it.
library(dplyr)

# Keep the human characters, add height_ft (height / 30.48), keep two
# columns, and collect the result into R.
arrow_table(starwars) %>%
  filter(species == "Human") %>%
  mutate(height_ft = height / 30.48) %>%
  select(name, height_ft) %>%
  collect()

# Several filter conditions are combined with a comma
arrow_table(starwars) %>%
  filter(species == "Human", homeworld == "Tatooine") %>%
  collect()

# Filter rows whose name matches "Darth".
# NOTE(review): str_detect() is a stringr function and stringr is never
# attached in this file; confirm arrow's own binding is enough here.
arrow_table(starwars) %>%
  filter(str_detect(name, "Darth")) %>%
  collect()
使用 PyArrow 在 R 会话中创建 Arrow 对象。
library(reticulate)

# Import the pyarrow Python module
pa <- import("pyarrow")

# Create a pyarrow scalar holding 42, then print it
pyarrow_scalar <- pa$scalar(42)
pyarrow_scalar
调用函数
# Two Arrow Tables built from different row ranges of mtcars
table_1 <- arrow_table(mtcars[1:5, ])
table_2 <- arrow_table(mtcars[11:15, ])

# Concatenate them through pyarrow's concat_tables() and collect the result
collect(
  pa$concat_tables(tables = list(table_1, table_2))
)
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。