416 Midterm Answer ------------------ Script to create and copy files to HDFS for use with Hive: hadoop fs -mkdir /user/rlawrenc/416/tpch hadoop fs -mkdir /user/rlawrenc/416/tpch/region hadoop fs -mkdir /user/rlawrenc/416/tpch/nation hadoop fs -mkdir /user/rlawrenc/416/tpch/part hadoop fs -mkdir /user/rlawrenc/416/tpch/supplier hadoop fs -mkdir /user/rlawrenc/416/tpch/partsupp hadoop fs -mkdir /user/rlawrenc/416/tpch/customer hadoop fs -mkdir /user/rlawrenc/416/tpch/orders hadoop fs -mkdir /user/rlawrenc/416/tpch/lineitem hadoop fs -put region.tbl /user/rlawrenc/416/tpch/region/region.tbl hadoop fs -put nation.tbl /user/rlawrenc/416/tpch/nation/nation.tbl hadoop fs -put part.tbl /user/rlawrenc/416/tpch/part/part.tbl hadoop fs -put supplier.tbl /user/rlawrenc/416/tpch/supplier/supplier.tbl hadoop fs -put partsupp.tbl /user/rlawrenc/416/tpch/partsupp/partsupp.tbl hadoop fs -put customer.tbl /user/rlawrenc/416/tpch/customer/customer.tbl hadoop fs -put order.tbl /user/rlawrenc/416/tpch/orders/order.tbl hadoop fs -put lineitem.tbl /user/rlawrenc/416/tpch/lineitem/lineitem.tbl DDL to create tables in Hive: CREATE EXTERNAL TABLE region (r_regionkey INT, r_name STRING, r_comment STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' LOCATION '/user/rlawrenc/416/tpch/region'; CREATE EXTERNAL TABLE nation (n_nationkey INT, n_name STRING, n_regionkey INT, n_comment STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' LOCATION '/user/rlawrenc/416/tpch/nation'; CREATE EXTERNAL TABLE part (p_partkey INT, p_name STRING, p_mfgr STRING, p_brand STRING, p_type STRING, p_size INT, p_container STRING, p_retailprice DOUBLE, p_comment STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' LOCATION '/user/rlawrenc/416/tpch/part'; CREATE EXTERNAL TABLE supplier (s_suppkey INT, s_name STRING, s_address STRING, s_nationkey INT, s_phone STRING, s_acctbal DOUBLE, s_comment STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' LOCATION '/user/rlawrenc/416/tpch/supplier'; CREATE EXTERNAL TABLE partsupp (ps_partkey INT, ps_suppkey INT, ps_availqty INT, ps_supplycost DOUBLE, ps_comment STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' LOCATION '/user/rlawrenc/416/tpch/partsupp'; CREATE EXTERNAL TABLE customer (c_custkey INT, c_name STRING, c_address STRING, c_nationkey INT, c_phone STRING, c_acctbal DOUBLE, c_mktsegment STRING, c_comment STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' LOCATION '/user/rlawrenc/416/tpch/customer'; CREATE EXTERNAL TABLE orders (o_orderkey INT, o_custkey INT, o_orderstatus STRING, o_totalprice DOUBLE, o_orderdate STRING, o_orderpriority STRING, o_clerk STRING, o_shippriority INT, o_comment STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' LOCATION '/user/rlawrenc/416/tpch/orders'; CREATE EXTERNAL TABLE lineitem (l_orderkey INT, l_partkey INT, l_suppkey INT, l_linenumber INT, l_quantity INT, l_extendedprice DOUBLE, l_discount DOUBLE, l_tax DOUBLE, l_returnflag STRING, l_linestatus STRING, l_shipdate STRING,l_commitdate STRING,l_receiptdate STRING, l_shipinstruct STRING, l_shipmode STRING, l_comment STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' LOCATION '/user/rlawrenc/416/tpch/lineitem'; Hive Questions -------------- /* Q1 - Top 10 customers by items ordered from Manufacturer#3 */ INSERT OVERWRITE LOCAL DIRECTORY 'hive_q1' SELECT c_custkey, c_name, SUM(l_quantity) totalItems FROM Customer C INNER JOIN Orders O ON C.c_custkey = O.o_custkey INNER JOIN LineItem LI ON O.o_orderkey = LI.l_orderkey INNER JOIN Part P ON LI.l_partkey = P.p_partkey WHERE P.p_mfgr = 'Manufacturer#3' GROUP BY c_custkey, c_name ORDER BY totalItems DESC LIMIT 10; /* Q2 - Return the number and percentage of customers in each nation that have an order (o_totalprice) over $400,000.*/ INSERT OVERWRITE LOCAL DIRECTORY 'hive_q2' SELECT N.n_name, COUNT(C.c_custkey) nationCustomers, COUNT(C.c_custkey)*100.0/C2.totalCustomers percentOfAllCustomers FROM Nation N LEFT OUTER JOIN (SELECT * FROM Customer C LEFT SEMI JOIN Orders O ON C.c_custkey = O.o_custkey and O.o_totalprice > 400000) C ON N.n_nationkey = C.c_nationkey CROSS JOIN (SELECT COUNT(c_custkey) as totalCustomers FROM Customer) C2 GROUP BY N.n_name, C2.totalCustomers; /* Simpler variants */ /* List of customers by nation. +1 mark */ SELECT n_name, SUM(c_custkey) nationCustomers FROM Nation N LEFT OUTER JOIN Customer C ON N.n_nationkey = C.c_nationkey GROUP BY n_name; /* List of customers by nation with o_totalprice > 400000 +2 marks */ SELECT n_name, COUNT(C.c_custkey) nationCustomers FROM Nation N LEFT OUTER JOIN (SELECT * FROM Customer C LEFT SEMI JOIN Orders O ON C.c_custkey = O.o_custkey and O.o_totalprice > 400000) C ON N.n_nationkey = C.c_nationkey GROUP BY n_name; Pig Questions ------------- /* Q1 - Return the average number of lineitems per order with a quantity over 5. */ L = LOAD '/user/rlawrenc/416/tpch/lineitem/lineitem.tbl' using PigStorage() AS (l_orderkey:int, l_partkey:int, l_suppkey:int, l_linenumber:int, l_quantity:int, l_extendedprice:double, l_discount:double, l_tax:double, l_returnflag:chararray, l_linestatus:chararray, l_shipdate:chararray, l_commitdate:chararray, l_receiptdate:chararray, l_shipinstruct:chararray, l_shipmode:chararray, l_comment:chararray); Over5 = FILTER L BY l_quantity > 5; G = GROUP Over5 BY l_orderkey; OrderCounts = FOREACH G GENERATE FLATTEN(group), COUNT(Over5.l_quantity) as cnt; G2 = GROUP OrderCounts ALL; X = FOREACH G2 GENERATE AVG(OrderCounts.cnt); DUMP X; STORE X INTO '/user/rlawrenc/416/midterm/pig_q1'; Output: (3.6631607518490874) Corresponding Hive query: SELECT AVG(cnt) FROM (SELECT l_orderkey, COUNT(l_quantity) as cnt FROM Lineitem WHERE l_quantity > 5 GROUP BY l_orderkey) X; /* Q2 - Return the top 5 part suppliers by dollar amount (l_quantity*l_extendedprice) to Canadian customers. */ C = LOAD '/user/rlawrenc/416/tpch/customer/customer.tbl' using PigStorage() AS (c_custkey:int, c_name:chararray, c_address:chararray, c_nationkey:int, c_phone:chararray, c_acctbal:double, c_mktsegment:chararray, c_comment:chararray); O = LOAD '/user/rlawrenc/416/tpch/orders/order.tbl' using PigStorage() AS (o_orderkey:int, o_custkey:int, o_orderstatus:chararray, o_totalprice:double, o_orderdate:chararray, o_orderpriority:chararray, o_clerk:chararray, o_shippriority:int, o_comment:chararray); L = LOAD '/user/rlawrenc/416/tpch/lineitem/lineitem.tbl' using PigStorage() AS (l_orderkey:int, l_partkey:int, l_suppkey:int, l_linenumber:int, l_quantity:int, l_extendedprice:double, l_discount:double, l_tax:double, l_returnflag:chararray, l_linestatus:chararray, l_shipdate:chararray, l_commitdate:chararray, l_receiptdate:chararray, l_shipinstruct:chararray, l_shipmode:chararray, l_comment:chararray); S = LOAD '/user/rlawrenc/416/tpch/supplier/supplier.tbl' using PigStorage() AS (s_suppkey:int, s_name:chararray, s_address:chararray, s_nationkey:int, s_phone:chararray, s_acctbal:double, s_comment:chararray); N = LOAD '/user/rlawrenc/416/tpch/nation/nation.tbl' using PigStorage() AS (n_nationkey:int, n_name:chararray, n_regionkey:int, n_comment:chararray); CdnNation = FILTER N BY n_name == 'CANADA'; CdnCust = JOIN C BY c_nationkey, CdnNation BY n_nationkey; CdnOrd = JOIN CdnCust BY c_custkey, O BY o_custkey; CdnItems = JOIN CdnOrd BY o_orderkey, L BY l_orderkey; CdnItemSuppliers = JOIN S BY s_suppkey, CdnItems BY l_suppkey; X = FOREACH CdnItemSuppliers GENERATE s_suppkey, s_name, l_quantity*l_extendedprice as orderTotal; G = GROUP X BY (s_suppkey, s_name); Y = FOREACH G GENERATE FLATTEN(group), SUM(X.orderTotal) as sumTotal; Z = ORDER Y BY sumTotal DESC; Z2 = LIMIT Z 5; dump Z2; STORE Z2 INTO '/user/rlawrenc/416/midterm/pig_q2'; Corresponding Hive query: SELECT S.s_suppkey, S.s_name, SUM(L.l_quantity*L.l_extendedprice) as orderTotal FROM Nation N INNER JOIN Customer C ON N.n_nationkey = C. c_nationkey INNER JOIN Orders O ON C.c_custkey = O.o_custkey INNER JOIN Lineitem L ON O.o_orderkey = L.l_orderkey INNER JOIN Supplier S ON L.l_suppkey = S.s_suppkey WHERE N.n_name = 'Canada' GROUP BY S.s_suppkey, S.s_name ORDER BY orderTotal DESC LIMIT 5; Map Reduce Question ------------------- Given a partkey, return the total number of times it was in an order, the total quantity ordered over all orders, and the average quantity ordered per order. Equivalent Hive query: SELECT l_partkey, COUNT(l_orderkey), SUM(l_quantity), SUM(l_quantity)/COUNT(l_orderkey) FROM Lineitem WHERE l_partkey = 25 GROUP BY l_partkey;