minio/mint/run/core/s3select/csv.py


#!/usr/bin/env python
# -*- coding: utf-8 -*-

import io
import os

from minio import Minio
from minio.select import (COMPRESSION_TYPE_NONE, FILE_HEADER_INFO_NONE,
                          JSON_TYPE_DOCUMENT, QUOTE_FIELDS_ALWAYS,
                          QUOTE_FIELDS_ASNEEDED, CSVInputSerialization,
                          CSVOutputSerialization, JSONInputSerialization,
                          JSONOutputSerialization, SelectRequest)

from utils import *

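
# Shared harness: uploads `input_data` as a fresh object, runs the given
# SelectRequest against it, and compares the streamed result (or any raised
# exception) with `expected_output`, removing the object afterwards.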
def test_sql_api(test_name, client, bucket_name, input_data, sql_opts, expected_output):
    """Test if the passed SQL request has the output equal to the passed expected one."""
    object_name = generate_object_name()
    got_output = b''
    try:
        client.put_object(bucket_name, object_name,
                          io.BytesIO(input_data), len(input_data))
        data = client.select_object_content(bucket_name, object_name, sql_opts)
        # Get the records
        records = io.BytesIO()
        for d in data.stream(10*1024):
            records.write(d)
        got_output = records.getvalue()
    except Exception as select_err:
        if not isinstance(expected_output, Exception):
            raise ValueError(
                'Test {} unexpectedly failed with: {}'.format(test_name, select_err))
    else:
        if isinstance(expected_output, Exception):
            raise ValueError(
                'Test {}: expected an exception, got {}'.format(test_name, got_output))
        if got_output != expected_output:
            raise ValueError('Test {}: data mismatch. Expected : {}, Received {}'.format(
                test_name, expected_output, got_output))
    finally:
        client.remove_object(bucket_name, object_name)


def test_csv_input_custom_quote_char(client, log_output):
    # Get a unique bucket_name; object names are generated per test case
    log_output.args['bucket_name'] = bucket_name = generate_bucket_name()
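    # Each case is (quote_character, quote_escape_character, input CSV bytes,
    # expected JSON records or an Exception when S3 Select should reject it).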
    tests = [
        # Invalid quote character, should fail
        ('""', '"', b'col1,col2,col3\n', Exception()),
        # UTF-8 quote character
        ('ع', '"', 'عcol1ع,عcol2ع,عcol3ع\n'.encode(),
         b'{"_1":"col1","_2":"col2","_3":"col3"}\n'),
        # Only one field is quoted
        ('"', '"', b'"col1",col2,col3\n',
         b'{"_1":"col1","_2":"col2","_3":"col3"}\n'),
        ('"', '"', b'"col1,col2,col3"\n', b'{"_1":"col1,col2,col3"}\n'),
        ('\'', '"', b'"col1",col2,col3\n',
         b'{"_1":"\\"col1\\"","_2":"col2","_3":"col3"}\n'),
        ('', '"', b'"col1",col2,col3\n',
         b'{"_1":"\\"col1\\"","_2":"col2","_3":"col3"}\n'),
        ('', '"', b'"col1",col2,col3\n',
         b'{"_1":"\\"col1\\"","_2":"col2","_3":"col3"}\n'),
        ('', '"', b'"col1","col2","col3"\n',
         b'{"_1":"\\"col1\\"","_2":"\\"col2\\"","_3":"\\"col3\\""}\n'),
        ('"', '"', b'""""""\n', b'{"_1":"\\"\\""}\n'),
        ('"', '"', b'A",B\n', b'{"_1":"A\\"","_2":"B"}\n'),
        ('"', '"', b'A"",B\n', b'{"_1":"A\\"\\"","_2":"B"}\n'),
        ('"', '\\', b'A\\B,C\n', b'{"_1":"A\\\\B","_2":"C"}\n'),
        ('"', '"', b'"A""B","CD"\n', b'{"_1":"A\\"B","_2":"CD"}\n'),
        ('"', '\\', b'"A\\B","CD"\n', b'{"_1":"AB","_2":"CD"}\n'),
        ('"', '\\', b'"A\\,","CD"\n', b'{"_1":"A,","_2":"CD"}\n'),
        ('"', '\\', b'"A\\"B","CD"\n', b'{"_1":"A\\"B","_2":"CD"}\n'),
        ('"', '\\', b'"A\\""\n', b'{"_1":"A\\""}\n'),
        ('"', '\\', b'"A\\"\\"B"\n', b'{"_1":"A\\"\\"B"}\n'),
        ('"', '\\', b'"A\\"","\\"B"\n', b'{"_1":"A\\"","_2":"\\"B"}\n'),
    ]
    client.make_bucket(bucket_name)
    try:
        for idx, (quote_char, escape_char, data, expected_output) in enumerate(tests):
            sql_opts = SelectRequest(
                "select * from s3object",
                CSVInputSerialization(
                    compression_type=COMPRESSION_TYPE_NONE,
                    file_header_info=FILE_HEADER_INFO_NONE,
                    record_delimiter="\n",
                    field_delimiter=",",
                    quote_character=quote_char,
                    quote_escape_character=escape_char,
                    comments="#",
                    allow_quoted_record_delimiter="FALSE",
                ),
                JSONOutputSerialization(
                    record_delimiter="\n",
                ),
                request_progress=False,
            )
            test_sql_api(f'test_{idx}', client, bucket_name,
                         data, sql_opts, expected_output)
    finally:
        client.remove_bucket(bucket_name)

    # Test passes
    print(log_output.json_report())


def test_csv_output_custom_quote_char(client, log_output):
    # Get a unique bucket_name; object names are generated per test case
    log_output.args['bucket_name'] = bucket_name = generate_bucket_name()
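    # Each case is (quote_character, quote_escape_character, input CSV bytes,
    # expected CSV output or an Exception when S3 Select should reject it).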
    tests = [
        # Invalid quote character, should fail
        ("''", "''", b'col1,col2,col3\n', Exception()),
        ("'", "'", b'col1,col2,col3\n', b"'col1','col2','col3'\n"),
        ("", '"', b'col1,col2,col3\n', b'\x00col1\x00,\x00col2\x00,\x00col3\x00\n'),
        ('"', '"', b'col1,col2,col3\n', b'"col1","col2","col3"\n'),
        ('"', '"', b'col"1,col2,col3\n', b'"col""1","col2","col3"\n'),
        ('"', '"', b'""""\n', b'""""\n'),
        ('"', '"', b'\n', b''),
        ("'", "\\", b'col1,col2,col3\n', b"'col1','col2','col3'\n"),
        ("'", "\\", b'col""1,col2,col3\n', b"'col\"\"1','col2','col3'\n"),
        ("'", "\\", b'col\'1,col2,col3\n', b"'col\\'1','col2','col3'\n"),
        ("'", "\\", b'"col\'1","col2","col3"\n', b"'col\\'1','col2','col3'\n"),
        ("'", "\\", b'col\'\n', b"'col\\''\n"),
        # Two consecutive escaped quotes
        ("'", "\\", b'"a"""""\n', b"'a\"\"'\n"),
    ]
    client.make_bucket(bucket_name)
    try:
        for idx, (quote_char, escape_char, input_data, expected_output) in enumerate(tests):
            sql_opts = SelectRequest(
                "select * from s3object",
                CSVInputSerialization(
                    compression_type=COMPRESSION_TYPE_NONE,
                    file_header_info=FILE_HEADER_INFO_NONE,
                    record_delimiter="\n",
                    field_delimiter=",",
                    quote_character='"',
                    quote_escape_character='"',
                    comments="#",
                    allow_quoted_record_delimiter="FALSE",
                ),
                CSVOutputSerialization(
                    quote_fields=QUOTE_FIELDS_ALWAYS,
                    record_delimiter="\n",
                    field_delimiter=",",
                    quote_character=quote_char,
                    quote_escape_character=escape_char,
                ),
                request_progress=False,
            )
            test_sql_api(f'test_{idx}', client, bucket_name,
                         input_data, sql_opts, expected_output)
    finally:
        client.remove_bucket(bucket_name)

    # Test passes
    print(log_output.json_report())
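

# A minimal usage sketch (not part of the original Mint test file) showing how
# these tests could be driven directly against a MinIO server. The environment
# variable names below and the LogOutput constructor arguments are assumptions;
# the Mint harness normally supplies its own entry point and logging setup.
#
# if __name__ == "__main__":
#     client = Minio(
#         os.getenv("SERVER_ENDPOINT", "play.min.io"),
#         access_key=os.getenv("ACCESS_KEY"),
#         secret_key=os.getenv("SECRET_KEY"),
#         secure=os.getenv("ENABLE_HTTPS", "1") == "1",
#     )
#     # LogOutput is expected to come from utils (via `from utils import *`);
#     # its exact signature is assumed here.
#     log_output = LogOutput(client.select_object_content,
#                            "test_csv_input_custom_quote_char")
#     test_csv_input_custom_quote_char(client, log_output)
#     log_output = LogOutput(client.select_object_content,
#                            "test_csv_output_custom_quote_char")
#     test_csv_output_custom_quote_char(client, log_output)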