-- Pair each row with the row whose index is exactly one greater,
-- and print every pair as a two-tuple bag.
points = LOAD 'input' USING PigStorage(',')
         AS (index:int, x:double, y:double);

-- Left side of the pairing: rows with index 1 and up.
lead_rows = FILTER points BY index >= 1;

-- Right side of the pairing: rows with index 2 and up, each tagged with
-- the index of the row it should be attached to (its own index minus one).
tail_rows    = FILTER points BY index > 1;
shifted_rows = FOREACH tail_rows GENERATE (index - 1) AS dindex, index, x, y;

-- A lead row matches the shifted row whose dindex equals its index,
-- i.e. the row that follows it; a row with no follower produces no output.
adjacent = JOIN lead_rows BY index, shifted_rows BY dindex;

-- Drop the helper key and emit {(i, x, y), (i+1, x', y')} per pair.
pair_bags = FOREACH adjacent GENERATE
    TOBAG(
        TOTUPLE(lead_rows::index, lead_rows::x, lead_rows::y),
        TOTUPLE(shifted_rows::index, shifted_rows::x, shifted_rows::y)
    );

DUMP pair_bags;
-- Pair each record with the record whose index is one lower, by
-- co-grouping two differently keyed copies of the same input.

-- Read the comma-separated records; the fields are left untyped.
R = LOAD 'data.txt'
    USING PigStorage(',')
    AS (index, x, y);

-- Two projections of every record that differ only in the leading key:
--   R1 is keyed by index, R2 is keyed by index + 1.
R1 = FOREACH R GENERATE index + 0, index, x, y;
R2 = FOREACH R GENERATE index + 1, index, x, y;

-- Collect, per key value, the matching R1 records and R2 records.
result = COGROUP R1 BY $0, R2 BY $0;

-- Discard keys where either side is missing (no partner record exists).
result2 = FILTER result BY NOT (IsEmpty(R1) OR IsEmpty(R2));

-- Unnest both bags so each surviving key yields one flat row.
result3 = FOREACH result2 GENERATE FLATTEN(R1), FLATTEN(R2);

-- Drop the helper keys and rebuild the two original (index, x, y) tuples.
result4 = FOREACH result3 GENERATE
    TOTUPLE(R1::index, R1::x, R1::y),
    TOTUPLE(R2::index, R2::x, R2::y);
2条答案
按热度按时间vm0i2vca1#
下面的方法将适用于您的情况。
输入:
Pig脚本:
输出:
7fhtutme2#
您可以使用以下查询(解释见注释)。
我用来测试的文件包含以下内容:
请注意,该文件中不含括号;如果原始数据带有括号,可以先用一个简单的预处理脚本将其去掉。
中间结果的转储为:
DUMP R;
(1,0.0,0.0) (2,-0.1,-0.1) (3,1.0,-2.2)
DUMP R1;
((1,1,0.0,0.0)) ((2,2,-0.1,-0.1)) ((3,3,1.0,-2.2))
DUMP R2;
((2,1,0.0,0.0)) ((3,2,-0.1,-0.1)) ((4,3,1.0,-2.2))
DUMP result;
(1,{(1,1,0.0,0.0)},{}) (2,{(2,2,-0.1,-0.1)},{(2,1,0.0,0.0)}) (3,{(3,3,1.0,-2.2)},{(3,2,-0.1,-0.1)}) (4,{},{(4,3,1.0,-2.2)})
DUMP result2;
(2,{(2,2,-0.1,-0.1)},{(2,1,0.0,0.0)}) (3,{(3,3,1.0,-2.2)},{(3,2,-0.1,-0.1)})
DUMP result3;
(2,2,-0.1,-0.1,2,1,0.0,0.0) (3,3,1.0,-2.2,3,2,-0.1,-0.1)
DUMP result4;
((2,-0.1,-0.1),(1,0.0,0.0))
((3,1.0,-2.2),(2,-0.1,-0.1))