hive在最近的日期左连接

iezvtpos  于 2021-05-29  发布在  Hadoop
关注(0)|答案(4)|浏览(316)

我试图在连接时使用一个键和两个表中最近的日期连接配置单元中的两个表。例如:下面是两个输入表

<----------TABLE A------------->            <------------TABLE B------------>
A_id    A_date      changed_col             B_id    B_date      B_value A_id

**********  ***********         **********  ************ 

A01     2017-03-20      ABC                 B01     2017-04-02  200     A01
A01     2017-04-01      XYZ                 B01     2017-04-04  500     A01
A01     2017-04-05      LLL

但是,当我将表b与表a连接在一起时,它应该在表a中为同一个键(a\ id)查找最近的最低日期。以下是预期输出表:

B_id    B_date          A_id        A_date      changed col   B_value

**********      ****    ******  ******************

B01     2017-02-04      A01     2017-01-04      XYZ             200
B01     2017-04-04      A01     2017-01-04      XYZ             500

非常感谢您的帮助。谢谢

zysjyyx4

zysjyyx41#

select  B.B_id
       ,B.B_date
       ,B.A_id
       ,A.A_date
       ,A.changed_col
       ,B_value

from                B

        left join  (select  *

                    from   (select  B.B_id
                                   ,A.A_date
                                   ,A.changed_col

                                   ,row_number () over
                                    (
                                        partition by    B.B_id
                                        order by        A.A_date desc
                                    ) as rn
                            from            B
                                    join    A
                                    on      A.A_id = B.A_id
                            where   A.A_date <= B.B_date 
                            ) A

                    where rn = 1
                    ) A

        on          A.B_id  =
                    B.B_id
+------+------------+------+------------+-------------+---------+
| b_id |   b_date   | a_id |   a_date   | changed_col | b_value |
+------+------------+------+------------+-------------+---------+
| B01  | 2017-04-02 | A01  | 2017-04-01 | XYZ         |     200 |
| B01  | 2017-04-04 | A01  | 2017-04-01 | XYZ         |     500 |
+------+------------+------+------------+-------------+---------+
ycl3bljg

ycl3bljg2#

select  B.B_id
       ,B.B_date
       ,B.A_id
       ,A.A_data.A_date         as A_date 
       ,A.A_data.changed_col    as changed_col
       ,B_value

from                B

        left join  (select      B.B_id
                               ,max (named_struct ('A_date',A_date,'changed_col',changed_col)) as A_data

                    from                B
                                join    A
                                on      A.A_id = B.A_id

                    where       A.A_date <= B.B_date 

                    group by    B.B_id
                    ) A

        on          A.B_id  =
                    B.B_id
+------+------------+------+------------+-------------+---------+
| b_id |   b_date   | a_id |   a_date   | changed_col | b_value |
+------+------------+------+------------+-------------+---------+
| B01  | 2017-04-02 | A01  | 2017-04-01 | XYZ         |     200 |
| B01  | 2017-04-04 | A01  | 2017-04-01 | XYZ         |     500 |
+------+------------+------+------------+-------------+---------+
lrpiutwd

lrpiutwd3#

另一种选择是:

select B.B_id, B.B_date, B.A_id, A1.A_id, A1.A_date, A1.changed_col, B.B_value
from 
TABLE_B as B,
(
select A_id, A_date, chaged_col
from TABLE_A as A
where A_date = (select min(A_date) from TABLE_A where A_id = A.A_id) ) as A1
where B.A_id = A.A_id
gk7wooem

gk7wooem4#

select  B_id
       ,dt                  as B_date
       ,A_id
       ,A_data.A_date       as A_date 
       ,A_data.changed_col
       ,B_value

from   (select  B_id,dt,B_value,A_id,tab               
               ,max 
                (   case 
                        when tab = 'A' 
                        then named_struct ('A_date',dt,'changed_col',changed_col) 
                    end
                ) over
                (
                    partition by    A_id
                    order by        dt,tab
                    rows            between unbounded preceding
                                    and     current row
                ) as A_data

        from   (select  B_id,B_date as dt,B_value,A_id
                       ,'B' as tab,null as changed_col
                from    B                

                union all

                select  null as B_id,A_date as dt,null as B_value,A_id
                       ,'A' as tab,changed_col
                from    A
                ) t
        ) t

where   tab = 'B'
+------+------------+------+------------+-------------+---------+
| b_id |   b_date   | a_id |   a_date   | changed_col | b_value |
+------+------------+------+------------+-------------+---------+
| B01  | 2017-04-02 | A01  | 2017-04-01 | XYZ         |     200 |
| B01  | 2017-04-04 | A01  | 2017-04-01 | XYZ         |     500 |
+------+------------+------+------------+-------------+---------+

相关问题