Data Warehouse Setup: Aliyun HDFS + Spark

This article builds on "Data Warehouse Setup: Aliyun HDFS" and "Data Warehouse Setup: Local Installation + Spark".

Component versions used in this article: Aliyun File Storage HDFS / Hadoop 2.7.2 / Hive 2.3.9 / Spark 2.4.8


Spark

cd /opt/services

wget https://mirrors.tuna.tsinghua.edu.cn/apache/spark/spark-2.4.8/spark-2.4.8-bin-without-hadoop.tgz

tar xf spark-2.4.8-bin-without-hadoop.tgz && cd spark-2.4.8-bin-without-hadoop
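
A quick optional sanity check: binary Spark distributions ship a RELEASE file at the top level describing the build, and jars/ should be well populated (the exact jar count is not important).

cat RELEASE
ls jars/ | wc -l
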
/opt/services/hadoop-2.7.2/bin/hadoop fs -mkdir -p /spark/jars

/opt/services/hadoop-2.7.2/bin/hadoop fs -put jars/* /spark/jars

/opt/services/hadoop-2.7.2/bin/hadoop fs -ls /spark/jars
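
These are the jars that spark.yarn.jars will point at later in hive-site.xml. As a documented alternative (not used in this article), Spark can also localize a single archive via spark.yarn.archive, which tends to be faster to distribute; a minimal sketch, assuming the JDK's jar tool is on PATH:

jar cv0f spark-libs.jar -C jars/ .
/opt/services/hadoop-2.7.2/bin/hadoop fs -put spark-libs.jar /spark/
# then set spark.yarn.archive=/spark/spark-libs.jar instead of spark.yarn.jars
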
vim conf/spark-defaults.conf
spark.master                     yarn
spark.serializer                 org.apache.spark.serializer.KryoSerializer
spark.driver.memory              2g
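
Only the essentials are set above. If the executor defaults turn out to be too small, resources can be tuned in the same file; the values below are placeholders, not recommendations:

spark.executor.memory            2g
spark.executor.cores             2
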
vim conf/spark-env.sh
export HADOOP_CONF_DIR=/opt/services/hadoop-2.7.2/etc/hadoop
export YARN_CONF_DIR=/opt/services/hadoop-2.7.2/etc/hadoop
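
Equivalently, the SPARK_DIST_CLASSPATH export that goes into ~/.bashrc in the next step can live here in conf/spark-env.sh instead; this is the approach described in Spark's "Hadoop free" build documentation and keeps the Hadoop classpath wiring next to the other Spark settings:

export SPARK_DIST_CLASSPATH=$(/opt/services/hadoop-2.7.2/bin/hadoop classpath)
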
vim ~/.bashrc
export SPARK_HOME=/opt/services/spark-2.4.8-bin-without-hadoop
export SPARK_CONF=/opt/services/spark-2.4.8-bin-without-hadoop/conf
export PATH=$SPARK_HOME/bin:$PATH
export SPARK_DIST_CLASSPATH=$(/opt/services/hadoop-2.7.2/bin/hadoop classpath)
source ~/.bashrc
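
With the environment loaded, spark-submit should now start against the Hadoop classpath. A quick check (the banner also confirms this is the Scala 2.11 build, which matters for the jars copied into Hive later):

spark-submit --version
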

YARN

cd /opt/services/hadoop-2.7.2

vim etc/hadoop/yarn-site.xml
<configuration>
  <property>
    <name>yarn.nodemanager.aux-services</name>
    <value>mapreduce_shuffle</value>
  </property>
  <property>
    <name>yarn.nodemanager.aux-services.mapreduce.shuffle.class</name>
    <value>org.apache.hadoop.mapred.ShuffleHandler</value>
  </property>
</configuration>
sbin/stop-yarn.sh

sbin/start-yarn.sh
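
Once YARN is back up, it is worth confirming that a NodeManager registered and that Spark can actually run a job on it. The SparkPi example bundled with the distribution makes a convenient smoke test; the examples jar name below is taken from the 2.4.8 distribution, adjust it if your layout differs:

bin/yarn node -list

cd /opt/services/spark-2.4.8-bin-without-hadoop

spark-submit --class org.apache.spark.examples.SparkPi --master yarn --deploy-mode client examples/jars/spark-examples_2.11-2.4.8.jar 10
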

Hive

cd /opt/services/apache-hive-2.3.9-bin

cp /opt/services/spark-2.4.8-bin-without-hadoop/jars/scala-library-2.11.12.jar lib/

cp /opt/services/spark-2.4.8-bin-without-hadoop/jars/spark-core_2.11-2.4.8.jar lib/

cp /opt/services/spark-2.4.8-bin-without-hadoop/jars/spark-network-common_2.11-2.4.8.jar lib/

cp /opt/services/spark-2.4.8-bin-without-hadoop/jars/spark-unsafe_2.11-2.4.8.jar lib/
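
Roughly speaking, these four jars give Hive's driver just enough of the Spark client to start a Hive on Spark session; the rest of the Spark runtime is resolved on the YARN side from the /spark/jars directory uploaded earlier. A quick check that they landed in Hive's lib/:

ls lib/ | grep -E 'spark|scala-library'
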
vim conf/hive-site.xml
<configuration>
  <property>
    <name>javax.jdo.option.ConnectionURL</name>
    <value>jdbc:mysql://127.0.0.1:3306/hive?useSSL=false</value>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionDriverName</name>
    <value>com.mysql.jdbc.Driver</value>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionUserName</name>
    <value>zhgmysql</value>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionPassword</name>
    <value>123456</value>
  </property>
  <property>
    <name>spark.yarn.jars</name>
    <value>dfs://f-2b786b7aage46.cn-hangzhou.dfs.aliyuncs.com:10290/spark/jars/*</value>
  </property>
  <property>
    <name>spark.master</name>
    <value>yarn</value>
  </property>
  <property>
    <name>hive.execution.engine</name>
    <value>spark</value>
  </property>
</configuration>
bin/hive
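
The student table comes from the base article. If you are starting from an empty warehouse, something like the following creates it first (schema inferred from the rows inserted below), and the set command confirms that the engine switch took effect:

create table if not exists student(id int, name string);

set hive.execution.engine;
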
insert into student values(3, "xiaohong");

insert into student values(4, "xiaoyuan");

insert into student values(5, "xiaoma");

select * from student where id >= 3;
3	xiaohong
4	xiaoyuan
5	xiaoma
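
Because hive.execution.engine is now spark, these statements run as a Spark application on YARN rather than as MapReduce jobs. While a query is executing, the application should be visible from the Hadoop side (run this in another shell):

/opt/services/hadoop-2.7.2/bin/yarn application -list -appStates RUNNING
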
