tvl-depot/users/flokli/archeology/parse_bucket_logs.rs
Florian Klink 46964f6d8f fix(users/flokli/archaeology): don't use file but column compression
Clickhouse also has column compression, configurable with the
output_format_parquet_compression_method setting.

It defaults to lz4, so the previous setting produced a zstd-compressed
parquet file containing lz4-compressed column data.

Set output_format_parquet_compression_method to zstd instead, and sort
by timestamp before assembling the parquet file.

The existing files were updated to the same format with the following query:

```
SELECT * FROM file('bucket_logs_2023-11-11*.pq', 'Parquet', 'auto') ORDER BY timestamp ASC INTO OUTFILE 'bucket_logs_2023-11-11.parquet' SETTINGS output_format_parquet_compression_method = 'zstd'
```

Change-Id: Id63b14c82e7bf4b9907a500528b569a51e277751
Reviewed-on: https://cl.tvl.fyi/c/depot/+/10008
Reviewed-by: raitobezarius <tvl@lahfa.xyz>
Tested-by: BuildkiteCI
2023-11-11 19:49:13 +00:00

41 lines
1.9 KiB
Rust

use std::env;
use std::process::Command;
use std::process::ExitCode;
/// Converts log files into a single Parquet file by delegating to `clickhouse-local`.
///
/// Expects exactly two command-line arguments:
///
/// 1. the input S3 URL (may be a glob), handed to ClickHouse's `s3` table function,
/// 2. the path of the Parquet file to write.
///
/// Returns [`ExitCode::FAILURE`] when the arguments are wrong, when
/// `clickhouse-local` cannot be started, or when it exits unsuccessfully;
/// otherwise returns [`ExitCode::SUCCESS`].
fn main() -> ExitCode {
    let args: Vec<String> = env::args().collect();
    // args[0] is the program name, so two user arguments means a length of 3.
    if args.len() != 3 {
        eprintln!("needs two args, input s3 url (glob) and output pq file");
        return ExitCode::FAILURE;
    }
    let input_files = &args[1];
    let output_file = &args[2];

    let mut cmd = Command::new("clickhouse-local");
    cmd.arg("--progress")
        .arg("-q")
        .arg(build_query(input_files, output_file));

    // `status()` only errors when the process could not be spawned or waited
    // on; a non-zero exit of clickhouse-local itself is reported through the
    // returned `ExitStatus` and has to be checked separately.
    let status = match cmd.status() {
        Ok(status) => status,
        Err(err) => {
            eprintln!("failed to run clickhouse-local: {err}");
            return ExitCode::FAILURE;
        }
    };
    if !status.success() {
        eprintln!("clickhouse-local failed: {status}");
        return ExitCode::FAILURE;
    }
    ExitCode::SUCCESS
}

/// Builds the ClickHouse query that parses the log lines and writes the Parquet file.
///
/// The query splits each input line into columns with the `Regexp` input
/// format, turns `-` placeholders into NULLs, casts the numeric columns to
/// `Int64`, parses the timestamp, sorts by it and writes the result as
/// zstd-compressed Parquet to `output_file`.
///
/// NOTE: both arguments are interpolated into the SQL text without escaping,
/// so a value containing a single quote will produce a broken query.
fn build_query(input_files: &str, output_file: &str) -> String {
    // The lines of the raw string deliberately start at column 0: everything
    // between the quotes is sent to ClickHouse verbatim.
    format!(r#"SELECT
key,
toInt64(nullif(http_status, '-')) AS http_status,
toInt64(nullif(object_size_str, '-')) AS object_size,
toInt64(nullif(bytes_sent_str, '-')) AS bytes_sent,
nullif(user_agent, '-') AS user_agent,
operation,
nullif(requester, '-') AS requester,
parseDateTime(timestamp_str, '%d/%b/%Y:%k:%i:%s %z') AS timestamp
FROM s3(
'{}',
'Regexp',
'owner String , bucket String, timestamp_str String, remote_ip String, requester LowCardinality(String), request_id String, operation LowCardinality(String), key String, request_uri String, http_status String, error_code String, bytes_sent_str String, object_size_str String, total_time String, turn_around_time String, referer String, user_agent String, version_id String, host_id String, signature_version String, cipher_suite String, authentication_type String, host_header String, tls_version String, access_point_arn String, acl_required String'
)
ORDER BY timestamp ASC
SETTINGS
format_regexp = '(\\S+) (\\S+) \\[(.*)\\] (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) ((?:\\S+ \\S+ \\S+)|\\S+) (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) (\\S+).*',
output_format_parquet_compression_method = 'zstd'
INTO OUTFILE '{}' FORMAT Parquet"#, input_files, output_file)
}