数仓搭建 之 阿里云HDFS

本文基于数仓搭建 之 本地安装

本文组件版本 文件系统HDFS版 / Hadoop 2.7.2 / Hive 2.3.9

目录

阿里云HDFS

  • 文件系统HDFS版 截止2022年2月6日仍在公测中

  • 创建文件系统 => 添加挂载点

注意ECS实例需要满足前提条件

cd /opt/services/

wget https://repo1.maven.org/maven2/com/aliyun/dfs/aliyun-sdk-dfs/1.0.5/aliyun-sdk-dfs-1.0.5.jar

Hadoop

cd /opt/services

wget http://archive.apache.org/dist/hadoop/core/hadoop-2.7.2/hadoop-2.7.2.tar.gz

tar xf hadoop-2.7.2.tar.gz && cd hadoop-2.7.2
  • 第1/2步 配置
vim etc/hadoop/hadoop-env.sh
export JAVA_HOME=/opt/services/jdk
export HADOOP_CLASSPATH=$HADOOP_CLASSPATH:/opt/services/aliyun-sdk-dfs-1.0.5.jar
vim etc/hadoop/core-site.xml
<configuration>
<property>
<name>fs.defaultFS</name>
<value>dfs://f-2b786b7aage46.cn-hangzhou.dfs.aliyuncs.com:10290</value>
</property>
<property>
<name>fs.dfs.impl</name>
<value>com.alibaba.dfs.DistributedFileSystem</value>
</property>
<property>
<name>fs.AbstractFileSystem.dfs.impl</name>
<value>com.alibaba.dfs.DFS</value>
</property>
<property>
<name>io.file.buffer.size</name>
<value>4194304</value>
<description>To achieve high throughput, no less than 1MB, no more than 8MB</description>
</property>
<property>
<name>dfs.connection.count</name>
<value>1</value>
<description>If multi threads in the same process will read/write to DFS, set to count of threads</description>
</property>
</configuration>
  • 第2/2步 测试
bin/hdfs dfs -mkdir /test

bin/hdfs dfs -ls /
# Found 1 items
# drwxrwxrwx - op op 0 2022-02-06 17:04 /test

YARN

cd /opt/services/hadoop-2.7.2

cp /opt/services/aliyun-sdk-dfs-1.0.5.jar share/hadoop/hdfs
  • 第1/3步 配置
vim etc/hadoop/mapred-site.xml
<configuration>
<property>
<name>mapreduce.framework.name</name>
<value>yarn</value>
</property>
<property>
<name>mapreduce.map.memory.mb</name>
<value>1024</value>
</property>
<property>
<name>mapreduce.reduce.memory.mb</name>
<value>1024</value>
</property>
</configuration>
vim etc/hadoop/yarn-site.xml
<configuration>
<property>
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle</value>
</property>
</configuration>
  • 第2/3步 启动
sbin/start-yarn.sh

jps
2451 ResourceManager
2740 NodeManager
  • 第3/3步 测试
echo "hello" >> /tmp/test1.txt
echo "world" >> /tmp/test1.txt

echo "hello" >> /tmp/test2.txt
echo "hadoop" >> /tmp/test2.txt

bin/hadoop fs -mkdir /input

bin/hadoop fs -put /tmp/test*.txt /input

bin/hadoop jar share/hadoop/mapreduce/hadoop-mapreduce-examples-2.7.2.jar wordcount /input /output

bin/hadoop fs -cat /output/part-r-00000
hadoop	1
hello	2
world	1

Hive

准备好MySQL服务 并新建数据库hive

cd /opt/services

wget https://mirrors.tuna.tsinghua.edu.cn/apache/hive/hive-2.3.9/apache-hive-2.3.9-bin.tar.gz

tar xf apache-hive-2.3.9-bin.tar.gz && cd apache-hive-2.3.9-bin
  • 第1/3步 配置
vim conf/hive-env.sh
export HADOOP_HOME=/opt/services/hadoop-2.7.2
export HIVE_CONF_DIR=/opt/services/apache-hive-2.3.9-bin/conf
export HIVE_AUX_JARS_PATH=/opt/services/apache-hive-2.3.9-bin/lib
vim conf/hive-site.xml
<configuration>
<property>
<name>javax.jdo.option.ConnectionURL</name>
<value>jdbc:mysql://127.0.0.1:3306/hive?useSSL=false</value>
</property>
<property>
<name>javax.jdo.option.ConnectionDriverName</name>
<value>com.mysql.cj.jdbc.Driver</value>
</property>
<property>
<name>javax.jdo.option.ConnectionUserName</name>
<value>zhgmysql</value>
</property>
<property>
<name>javax.jdo.option.ConnectionPassword</name>
<value>123456</value>
</property>
</configuration>
cp mysql-connector-java-8.0.28.jar /opt/services/apache-hive-2.3.9-bin/lib/
  • 第2/3步 启动
bin/schematool -dbType mysql -initSchema

bin/hive
  • 第3/3步 测试
show databases;
# default

use default;

create table student(id int, name string);

show tables;
# student

insert into student values(1, "xiaoming");

select * from student;
# 1 xiaoming

参考