본문 바로가기

Develop/DataProcess

[HIve] get access log from RegexSerDe

in official Example:

CREATE EXTERNAL TABLE intermediate_access_logs (

    ip STRING,

    date STRING,

    method STRING,

    url STRING,

    http_version STRING,

    code1 STRING,

    code2 STRING,

    dash STRING,

    user_agent STRING)

ROW FORMAT SERDE 'org.apache.hadoop.hive.contrib.serde2.RegexSerDe'

WITH SERDEPROPERTIES (

    'input.regex' = '([^ ]*) - - \\[([^\\]]*)\\] "([^\ ]*) ([^\ ]*) ([^\ ]*)" (\\d*) (\\d*) "([^"]*)" "([^"]*)"',

    'output.format.string' = "%1$$s %2$$s %3$$s %4$$s %5$$s %6$$s %7$$s %8$$s %9$$s")

LOCATION '/user/hive/warehouse/original_access_logs';


there is null rows when we don't have dash and user_agent values.

so..need to modify input.regex

CREATE EXTERNAL TABLE intermediate_access_logs (

    ip STRING,

    date STRING,

    method STRING,

    url STRING,

    http_version STRING,

    code1 STRING,

    code2 STRING,

    dash STRING,

    user_agent STRING)

ROW FORMAT SERDE 'org.apache.hadoop.hive.contrib.serde2.RegexSerDe'

WITH SERDEPROPERTIES (

    'input.regex' = '([^ ]*) - - \\[([^\\]]*)\\] "([^\ ]*) ([^\ ]*) ([^\ ]*)" (\\d*) ([\\d]*|-)(?: ([^ "]*|"[^"]*") ([^ "]*|"[^"]*"))?',

    'output.format.string' = "%1$$s %2$$s %3$$s %4$$s %5$$s %6$$s %7$$s %8$$s %9$$s")

LOCATION '/user/hive/warehouse/original_access_logs';


it's fine!