/* Q1 - List All Games */ G = load '/user/rlawrenc/416/lab2/small/games.txt' using PigStorage() AS (gid:int, name:chararray, publisher:chararray, release:chararray, rating:chararray); STORE G INTO '/user/rlawrenc/416/lab3/q1/'; dump G; /* Q2 - Find a Game by Id */ G = load '/user/rlawrenc/416/lab2/small/games.txt' using PigStorage() AS (gid:int, name:chararray, publisher:chararray, release:chararray, rating:chararray); A = FILTER G BY gid == 45; STORE A INTO '/user/rlawrenc/416/lab3/q2/'; DUMP A; /* Q3 - Players over 18 */ /* Need to build a udf.jar first that contains the AGE() function. */ /* Need hadoop library (hadoop-core-1.0.4.jar) and pig library (pig-0.10.1-core.jar). These libraries are in /usr/share/hadoop and usr/share/pig. */ REGISTER udf.jar; P = load '/user/rlawrenc/416/lab2/small/players.txt' using PigStorage() AS (pid:int, fname:chararray, lname:chararray, bdate:chararray, sex:chararray, path:chararray); R = FOREACH P GENERATE *, udf.AGE(bdate); A = FILTER R BY $6 >= 18; B = ORDER A BY $6; STORE B INTO '/user/rlawrenc/416/lab3/q3/'; DUMP B; /* Q4 - Number of Players per Game */ PG = load '/user/rlawrenc/416/lab2/small/player_games.txt' using PigStorage() AS (pid:int, gid:int, score:int); A = GROUP PG BY gid; B = FOREACH A GENERATE group, COUNT(PG); STORE B INTO '/user/rlawrenc/416/lab3/q4/'; DUMP B; /* Q5 - List the Top 10 Scores for a Game */ PG = load '/user/rlawrenc/416/lab2/small/player_games.txt' using PigStorage() AS (pid:int, gid:int, score:int); A = FILTER PG BY gid == 45; B = ORDER A BY score DESC; C = FOREACH B GENERATE pid, score; D = LIMIT C 10; STORE D INTO '/user/rlawrenc/416/lab3/q5/'; DUMP D; /* Q6 - Number of Players Two Games have in Common */ PG1 = load '/user/rlawrenc/416/lab2/small/player_games.txt' using PigStorage() AS (pid:int, gid:int, score:int); PG2 = load '/user/rlawrenc/416/lab2/small/player_games.txt' using PigStorage() AS (pid:int, gid:int, score:int); A = JOIN PG1 BY pid, PG2 BY pid; B = GROUP A BY (PG1::gid, PG2::gid); C = FOREACH B GENERATE FLATTEN(group), COUNT(A); D = FILTER C BY $0 != $1; STORE D INTO '/user/rlawrenc/416/lab3/q6'; /* Q7 - Count Top Players for Each Game */ /* Tests left outer join */ /* Games 6,7,95 (and others) do not have any players above 98,000 score. */ PG1 = load '/user/rlawrenc/416/lab2/small/player_games.txt' using PigStorage() AS (pid:int, gid:int, score:int); G = load '/user/rlawrenc/416/lab2/small/games.txt' using PigStorage() AS (gid:int, name:chararray, publisher:chararray, release:chararray, rating:chararray); A = FILTER PG1 BY score > 98000; D = JOIN G BY gid LEFT OUTER, A BY gid; B = GROUP D BY G::gid; E = FOREACH B GENERATE group, MAX(D.G::name), COUNT(D.$6); /* A.pid (will be null if no player above 98000) */ STORE E INTO '/user/rlawrenc/416/lab3/q7'; DUMP E; /* Q8 - List good players or play 'Electronic Arts' games */ /* Retrieve EA players */ G = load '/user/rlawrenc/416/lab2/small/games.txt' using PigStorage() AS (gid:int, name:chararray, publisher:chararray, release:chararray, rating:chararray); A = FILTER G BY publisher == 'Electronic Arts'; PG1 = load '/user/rlawrenc/416/lab2/small/player_games.txt' using PigStorage() AS (pid:int, gid:int, score:int); C = JOIN A BY gid, PG1 BY gid; D = FOREACH C GENERATE PG1::pid; /* Retrieve good players */ PG2 = load '/user/rlawrenc/416/lab2/small/player_games.txt' using PigStorage() AS (pid:int, gid:int, score:int); R = FILTER PG2 BY score > 90000; S = FOREACH R GENERATE pid; /* Union to sets together */ T = UNION S, D; U = DISTINCT T; /* Perform join to get player id and player name */ P = load '/user/rlawrenc/416/lab2/small/players.txt' using PigStorage() AS (pid:int, fname:chararray, lname:chararray, bdate:chararray, sex:chararray, path:chararray); X = JOIN P BY pid, U by pid; Y = FOREACH X GENERATE P::pid, fname, lname; STORE Y INTO '/user/rlawrenc/416/lab3/q8'; DUMP Y; /* Note: Check to make sure 998 player does not appear. */ /* Q9 original - Most Popular Games per Publisher by Gender */ /* Determine player counts by game and gender */ PG1 = load '/user/rlawrenc/416/lab2/small/player_games.txt' using PigStorage() AS (pid:int, gid:int, score:int); P = load '/user/rlawrenc/416/lab2/small/players.txt' using PigStorage() AS (pid:int, fname:chararray, lname:chararray, bdate:chararray, sex:chararray, path:chararray); A = JOIN P BY pid, PG1 by pid; B = GROUP A BY (gid, sex); C = FOREACH B GENERATE FLATTEN(group), COUNT(A); D = C; E = load '/user/rlawrenc/416/lab2/small/games.txt' using PigStorage() AS (gid:int, name:chararray, publisher:chararray, release:chararray, rating:chararray); F = JOIN E BY gid, D BY $0; U = FOREACH F GENERATE $2, $6, $7, $0, $1; /* publisher name, sex, countPlayerOfSex, gid, game name */ /* Join with Games to get publisher info. */ G = load '/user/rlawrenc/416/lab2/small/games.txt' using PigStorage() AS (gid:int, name:chararray, publisher:chararray, release:chararray, rating:chararray); R = JOIN C BY gid, G BY gid; S = GROUP R BY (publisher, sex); /* Select the maximum # of players for each (publisher, sex) group. */ T = FOREACH S GENERATE FLATTEN(group), MAX(R.$2); V = JOIN U BY $0..$2, T BY $0..$2; Z = FOREACH V GENERATE $0, $1, $2, $3..$4; Z2 = ORDER Z BY $0..$1; STORE Z2 INTO '/user/rlawrenc/416/lab3/q9_org'; DUMP T; /* Q9 - Most Players of Each Sex for a Game for each Publisher */ /* Determine player counts by game and gender */ PG1 = load '/user/rlawrenc/416/lab2/small/player_games.txt' using PigStorage() AS (pid:int, gid:int, score:int); P = load '/user/rlawrenc/416/lab2/small/players.txt' using PigStorage() AS (pid:int, fname:chararray, lname:chararray, bdate:chararray, sex:chararray, path:chararray); A = JOIN P BY pid, PG1 by pid; B = GROUP A BY (gid, sex); C = FOREACH B GENERATE FLATTEN(group), COUNT(A); /* Join with Games to get publisher info. */ G = load '/user/rlawrenc/416/lab2/small/games.txt' using PigStorage() AS (gid:int, name:chararray, publisher:chararray, release:chararray, rating:chararray); R = JOIN C BY gid, G BY gid; S = GROUP R BY (publisher, sex); /* Select the maximum # of players for each (publisher, sex) group. */ T = FOREACH S GENERATE FLATTEN(group), MAX(R.$2); STORE T INTO '/user/rlawrenc/416/lab3/q9'; DUMP T; /* Q10 - Game Breakdown by Gender */ /* Determine player counts by game and gender */ PG1 = load '/user/rlawrenc/416/lab2/small/player_games.txt' using PigStorage() AS (pid:int, gid:int, score:int); P = load '/user/rlawrenc/416/lab2/small/players.txt' using PigStorage() AS (pid:int, fname:chararray, lname:chararray, bdate:chararray, sex:chararray, path:chararray); A = JOIN P BY pid, PG1 by pid; B = GROUP A BY (gid, sex); C = FOREACH B GENERATE FLATTEN(group), COUNT(A); /* Appear to need to build entire subtree again. Cannot just re-assign to another alias. e.g. C2=C */ PG2 = load '/user/rlawrenc/416/lab2/small/player_games.txt' using PigStorage() AS (pid:int, gid:int, score:int); P2 = load '/user/rlawrenc/416/lab2/small/players.txt' using PigStorage() AS (pid:int, fname:chararray, lname:chararray, bdate:chararray, sex:chararray, path:chararray); A2 = JOIN P2 BY pid, PG2 by pid; B2 = GROUP A2 BY (gid, sex); C2 = FOREACH B2 GENERATE FLATTEN(group), COUNT(A2); /* Join this relation with itself and calculate percentages */ R = JOIN C BY gid, C2 BY gid; S = FILTER R BY $1 != $4 and $1 == 'female'; /* Only keep 1 of 4 records with female entry first. $1 and $4 are sex fields. */ T = FOREACH S GENERATE $0, $2+$5, $1, $2, 1.0*$2/($2+$5), $4, $5, 1.0*$5/($2+$5); /* $2 and $5 fields are the computed counts. */ STORE T INTO '/user/rlawrenc/416/lab3/q10'; DUMP T;