minio/mint/run/core/s3select/csv.py

167 lines
6.8 KiB
Python
Raw Normal View History

2020-05-15 14:20:57 -04:00
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import io
import os
2020-05-15 14:20:57 -04:00
from minio import Minio
from minio.select import (COMPRESSION_TYPE_NONE, FILE_HEADER_INFO_NONE,
JSON_TYPE_DOCUMENT, QUOTE_FIELDS_ALWAYS,
QUOTE_FIELDS_ASNEEDED, CSVInputSerialization,
CSVOutputSerialization, JSONInputSerialization,
JSONOutputSerialization, SelectRequest)
2020-05-15 14:20:57 -04:00
from utils import *
2020-05-15 14:20:57 -04:00
def test_sql_api(test_name, client, bucket_name, input_data, sql_opts, expected_output):
""" Test if the passed SQL request has the output equal to the passed execpted one"""
object_name = generate_object_name()
got_output = b''
try:
bytes_content = io.BytesIO(input_data)
client.put_object(bucket_name, object_name,
io.BytesIO(input_data), len(input_data))
2020-05-15 14:20:57 -04:00
data = client.select_object_content(bucket_name, object_name, sql_opts)
# Get the records
records = io.BytesIO()
for d in data.stream(10*1024):
records.write(d)
got_output = records.getvalue()
2020-05-15 14:20:57 -04:00
except Exception as select_err:
if not isinstance(expected_output, Exception):
raise ValueError(
'Test {} unexpectedly failed with: {}'.format(test_name, select_err))
2020-05-15 14:20:57 -04:00
else:
if isinstance(expected_output, Exception):
raise ValueError(
'Test {}: expected an exception, got {}'.format(test_name, got_output))
2020-05-15 14:20:57 -04:00
if got_output != expected_output:
raise ValueError('Test {}: data mismatch. Expected : {}, Received {}'.format(
test_name, expected_output, got_output))
2020-05-15 14:20:57 -04:00
finally:
client.remove_object(bucket_name, object_name)
def test_csv_input_custom_quote_char(client, log_output):
# Get a unique bucket_name and object_name
log_output.args['bucket_name'] = bucket_name = generate_bucket_name()
tests = [
# Invalid quote character, should fail
('""', '"', b'col1,col2,col3\n', Exception()),
# UTF-8 quote character
('ع', '"', 'عcol1ع,عcol2ع,عcol3ع\n'.encode(),
b'{"_1":"col1","_2":"col2","_3":"col3"}\n'),
# Only one field is quoted
('"', '"', b'"col1",col2,col3\n',
b'{"_1":"col1","_2":"col2","_3":"col3"}\n'),
('"', '"', b'"col1,col2,col3"\n', b'{"_1":"col1,col2,col3"}\n'),
('\'', '"', b'"col1",col2,col3\n',
b'{"_1":"\\"col1\\"","_2":"col2","_3":"col3"}\n'),
('', '"', b'"col1",col2,col3\n',
b'{"_1":"\\"col1\\"","_2":"col2","_3":"col3"}\n'),
('', '"', b'"col1",col2,col3\n',
b'{"_1":"\\"col1\\"","_2":"col2","_3":"col3"}\n'),
('', '"', b'"col1","col2","col3"\n',
b'{"_1":"\\"col1\\"","_2":"\\"col2\\"","_3":"\\"col3\\""}\n'),
('"', '"', b'""""""\n', b'{"_1":"\\"\\""}\n'),
('"', '"', b'A",B\n', b'{"_1":"A\\"","_2":"B"}\n'),
('"', '"', b'A"",B\n', b'{"_1":"A\\"\\"","_2":"B"}\n'),
('"', '\\', b'A\\B,C\n', b'{"_1":"A\\\\B","_2":"C"}\n'),
('"', '"', b'"A""B","CD"\n', b'{"_1":"A\\"B","_2":"CD"}\n'),
('"', '\\', b'"A\\B","CD"\n', b'{"_1":"AB","_2":"CD"}\n'),
('"', '\\', b'"A\\,","CD"\n', b'{"_1":"A,","_2":"CD"}\n'),
('"', '\\', b'"A\\"B","CD"\n', b'{"_1":"A\\"B","_2":"CD"}\n'),
('"', '\\', b'"A\\""\n', b'{"_1":"A\\""}\n'),
('"', '\\', b'"A\\"\\"B"\n', b'{"_1":"A\\"\\"B"}\n'),
('"', '\\', b'"A\\"","\\"B"\n', b'{"_1":"A\\"","_2":"\\"B"}\n'),
2020-05-15 14:20:57 -04:00
]
client.make_bucket(bucket_name)
try:
for idx, (quote_char, escape_char, data, expected_output) in enumerate(tests):
sql_opts = SelectRequest(
"select * from s3object",
CSVInputSerialization(
compression_type=COMPRESSION_TYPE_NONE,
file_header_info=FILE_HEADER_INFO_NONE,
record_delimiter="\n",
field_delimiter=",",
quote_character=quote_char,
quote_escape_character=escape_char,
comments="#",
allow_quoted_record_delimiter="FALSE",
),
JSONOutputSerialization(
record_delimiter="\n",
),
request_progress=False,
)
test_sql_api(f'test_{idx}', client, bucket_name,
data, sql_opts, expected_output)
2020-05-15 14:20:57 -04:00
finally:
client.remove_bucket(bucket_name)
# Test passes
print(log_output.json_report())
2020-05-15 14:20:57 -04:00
def test_csv_output_custom_quote_char(client, log_output):
# Get a unique bucket_name and object_name
log_output.args['bucket_name'] = bucket_name = generate_bucket_name()
tests = [
# UTF-8 quote character
("''", "''", b'col1,col2,col3\n', Exception()),
("'", "'", b'col1,col2,col3\n', b"'col1','col2','col3'\n"),
("", '"', b'col1,col2,col3\n', b'\x00col1\x00,\x00col2\x00,\x00col3\x00\n'),
('"', '"', b'col1,col2,col3\n', b'"col1","col2","col3"\n'),
('"', '"', b'col"1,col2,col3\n', b'"col""1","col2","col3"\n'),
('"', '"', b'""""\n', b'""""\n'),
('"', '"', b'\n', b''),
("'", "\\", b'col1,col2,col3\n', b"'col1','col2','col3'\n"),
("'", "\\", b'col""1,col2,col3\n', b"'col\"\"1','col2','col3'\n"),
("'", "\\", b'col\'1,col2,col3\n', b"'col\\'1','col2','col3'\n"),
("'", "\\", b'"col\'1","col2","col3"\n', b"'col\\'1','col2','col3'\n"),
("'", "\\", b'col\'\n', b"'col\\''\n"),
# Two consecutive escaped quotes
("'", "\\", b'"a"""""\n', b"'a\"\"'\n"),
2020-05-15 14:20:57 -04:00
]
client.make_bucket(bucket_name)
try:
for idx, (quote_char, escape_char, input_data, expected_output) in enumerate(tests):
sql_opts = SelectRequest(
"select * from s3object",
CSVInputSerialization(
compression_type=COMPRESSION_TYPE_NONE,
file_header_info=FILE_HEADER_INFO_NONE,
record_delimiter="\n",
field_delimiter=",",
quote_character='"',
quote_escape_character='"',
comments="#",
allow_quoted_record_delimiter="FALSE",
),
CSVOutputSerialization(
quote_fields=QUOTE_FIELDS_ALWAYS,
record_delimiter="\n",
field_delimiter=",",
quote_character=quote_char,
quote_escape_character=escape_char,
),
request_progress=False,
)
test_sql_api(f'test_{idx}', client, bucket_name,
input_data, sql_opts, expected_output)
2020-05-15 14:20:57 -04:00
finally:
client.remove_bucket(bucket_name)
# Test passes
print(log_output.json_report())