Data Warehouse Setup: Local Installation + Spark + OSS

This article builds on "Data Warehouse Setup: Local Installation + Spark".

Component versions used in this article: Hadoop 3.2.2 / Hive 3.1.2 / Spark 2.4.8 / Alibaba Cloud OSS


OSS

  • Create an Alibaba Cloud OSS Bucket => hive-yl

  • Step 1/3: Install

cd /opt/services

wget "https://gosspublic.alicdn.com/ossfs/ossfs_1.80.6_ubuntu18.04_amd64.deb?spm=a2c4g.11186623.0.0.695f62feoThXjE&file=ossfs_1.80.6_ubuntu18.04_amd64.deb" -O ossfs_1.80.6_ubuntu18.04_amd64.deb

sudo dpkg -i ossfs_1.80.6_ubuntu18.04_amd64.deb

sudo apt --fix-broken install

sudo chown `whoami`:`whoami` /usr/local/bin/ossfs

ossfs --version
  • Step 2/3: Configure
sudo vim /etc/passwd-ossfs
# hive-yl:accessKeyId:accessKeySecret

sudo chmod 640 /etc/passwd-ossfs

sudo chown `whoami`:`whoami` /etc/passwd-ossfs
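
The same file can also be written non-interactively; a one-line sketch (yourAccessKeyId / yourAccessKeySecret are placeholders for your own credentials):

echo "hive-yl:yourAccessKeyId:yourAccessKeySecret" | sudo tee /etc/passwd-ossfs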
  • Step 3/3: Mount
sudo mkdir /data

sudo chown `whoami`:`whoami` /data

ossfs hive-yl /data -ouid=1000 -ogid=1000 -o url=oss-cn-hangzhou-internal.aliyuncs.com

df -h | grep data
# ossfs 256T 0 256T 0% /data
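
As an optional sanity check, write and read a small file through the mount (test.txt is an arbitrary name):

echo "hello oss" > /data/test.txt
cat /data/test.txt
# hello oss
rm /data/test.txt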

Hadoop

  • Step 1/2: Configure
cd /opt/services/hadoop-3.2.2

vim etc/hadoop/core-site.xml
<configuration>
  <property>
    <name>fs.defaultFS</name>
    <value>hdfs://localhost:9000</value>
  </property>
  <property>
    <name>hadoop.tmp.dir</name>
    <value>/data</value>
  </property>
</configuration>
vim etc/hadoop/hdfs-site.xml
<configuration>
  <property>
    <name>dfs.replication</name>
    <value>1</value>
  </property>
  <property>
    <name>dfs.datanode.data.dir</name>
    <value>file://${hadoop.tmp.dir}/dfs/data</value>
  </property>
  <property>
    <name>dfs.namenode.name.dir</name>
    <value>file://${hadoop.tmp.dir}/dfs/name</value>
  </property>
  <property>
    <name>dfs.namenode.checkpoint.dir</name>
    <value>file://${hadoop.tmp.dir}/dfs/namesecondary</value>
  </property>
</configuration>
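
Because core-site.xml sets hadoop.tmp.dir to /data (the ossfs mount point), the ${hadoop.tmp.dir} placeholders above resolve to directories inside the OSS bucket. A quick way to confirm the resolved values (the output shown is what the configuration above should yield):

bin/hdfs getconf -confKey dfs.namenode.name.dir
# file:///data/dfs/name
bin/hdfs getconf -confKey dfs.datanode.data.dir
# file:///data/dfs/data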
  • Step 2/2: Start
sbin/stop-dfs.sh

bin/hdfs namenode -format

sbin/start-dfs.sh
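
After the daemons come up, jps should list a NameNode, DataNode and SecondaryNameNode, and the HDFS directories should now exist under the ossfs mount (illustrative output):

jps
# NameNode
# DataNode
# SecondaryNameNode

ls /data/dfs
# data  name  namesecondary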

Hive

  • Step 1/3: Configure
cd /opt/services/apache-hive-3.1.2-bin

vim conf/hive-site.xml
<configuration>
  <property>
    <name>javax.jdo.option.ConnectionURL</name>
    <value>jdbc:mysql://127.0.0.1:3306/hive?ssl=false</value>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionDriverName</name>
    <value>com.mysql.jdbc.Driver</value>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionUserName</name>
    <value>zhgmysql</value>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionPassword</name>
    <value>123456</value>
  </property>
  <property>
    <name>spark.yarn.jars</name>
    <value>hdfs://127.0.0.1:9000/spark/jars/*</value>
  </property>
  <property>
    <name>spark.master</name>
    <value>yarn</value>
  </property>
  <property>
    <name>hive.execution.engine</name>
    <value>spark</value>
  </property>
  <property>
    <name>hive.spark.client.connect.timeout</name>
    <value>50000ms</value>
  </property>
  <property>
    <name>hive.spark.client.server.connect.timeout</name>
    <value>500000ms</value>
  </property>
</configuration>
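
spark.yarn.jars above assumes the Spark jars were already uploaded to HDFS in the base article. If they were not, something like the following would push them up (the local Spark path is only an assumed install location; adjust it to your layout):

/opt/services/hadoop-3.2.2/bin/hadoop fs -mkdir -p /spark/jars
/opt/services/hadoop-3.2.2/bin/hadoop fs -put /opt/services/spark-2.4.8/jars/* /spark/jars/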
  • Step 2/3: Start
bin/hive
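
Inside the Hive CLI, the engine setting can be double-checked before running anything (expected output assuming the configuration above was picked up):

set hive.execution.engine;
# hive.execution.engine=spark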
  • Step 3/3: Test
show databases;
# default

use default;

show tables;
# student

insert into student values(1, "xiaoming");

select * from student;
# 1 xiaoming
/opt/services/hadoop-3.2.2/bin/hadoop fs -ls oss://hive-yl/hive/warehouse
# drwxrwxrwx - op op 0 2022-01-19 09:37 oss://hive-yl/hive/warehouse/student
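
Because the same bucket is mounted locally, the warehouse data should also be visible directly through the ossfs mount point (assuming the /data mount from the OSS section is still active):

ls /data/hive/warehouse/student
# data file(s) written by the INSERT above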
