Skip to content

Commit fe2b6b9

Browse files
committed
(WIP) improve pyarrow schema detection
Add more pyarrow types, and convert to pyarrow only the columns whose schema could not be detected.
1 parent e462037 commit fe2b6b9

File tree

1 file changed

+72
-30
lines changed

1 file changed

+72
-30
lines changed

bigquery/google/cloud/bigquery/_pandas_helpers.py

Lines changed: 72 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -110,13 +110,39 @@ def pyarrow_timestamp():
110110
"TIME": pyarrow_time,
111111
"TIMESTAMP": pyarrow_timestamp,
112112
}
113-
ARROW_SCALARS_TO_BQ = {
114-
arrow_type(): bq_type # TODO: explain wht calling arrow_type()
115-
for bq_type, arrow_type in BQ_TO_ARROW_SCALARS.items()
113+
# Maps a pyarrow scalar type id to the BigQuery type its values map to.
# A ``DataType.id`` identifies the Arrow type *class* only, so parametrized
# types (time/timestamp units, decimal precision/scale) collapse to a single
# entry regardless of their parameters.
ARROW_SCALAR_IDS_TO_BQ = {
    # https://arrow.apache.org/docs/python/api/datatypes.html#type-classes
    pyarrow.bool_().id: "BOOLEAN",
    pyarrow.int8().id: "INT64",
    pyarrow.int16().id: "INT64",
    pyarrow.int32().id: "INT64",
    pyarrow.int64().id: "INT64",
    pyarrow.uint8().id: "INT64",
    pyarrow.uint16().id: "INT64",
    pyarrow.uint32().id: "INT64",
    pyarrow.uint64().id: "INT64",
    pyarrow.float16().id: "FLOAT64",
    pyarrow.float32().id: "FLOAT64",
    pyarrow.float64().id: "FLOAT64",
    pyarrow.time32("ms").id: "TIME",
    pyarrow.time64("ns").id: "TIME",
    pyarrow.timestamp("ns").id: "TIMESTAMP",
    pyarrow.date32().id: "DATE",
    pyarrow.date64().id: "DATETIME",  # because of its millisecond resolution
    pyarrow.binary().id: "BYTES",
    pyarrow.string().id: "STRING",  # also alias for pyarrow.utf8()
    # The precision/scale arguments do not affect the ``.id`` (see note
    # above); 38/9 is simply what BigQuery NUMERIC uses.
    pyarrow.decimal128(38, scale=9).id: "NUMERIC",
}
# NOTE: BigQuery's GEOGRAPHY type needs no dedicated entry here — it is
# represented as a string, and the string type id is already mapped to STRING.
# TODO: add additional unit tests covering these types
142+
117143
else: # pragma: NO COVER
118144
BQ_TO_ARROW_SCALARS = {} # pragma: NO COVER
119-
ARROW_SCALARS_TO_BQ = {} # pragma: NO_COVER
145+
ARROW_SCALAR_IDS_TO_BQ = {} # pragma: NO_COVER
120146

121147

122148
def bq_to_arrow_struct_data_type(field):
@@ -279,6 +305,7 @@ def dataframe_to_bq_schema(dataframe, bq_schema):
279305

280306
# Otherwise, try to automatically determine the type based on the
281307
# pandas dtype.
308+
# TODO: make a function for Arrow type + field name -> BQ field
282309
bq_type = _PANDAS_DTYPE_TO_BQ.get(dtype.name)
283310
if not bq_type:
284311
warnings.warn(u"Unable to determine type of column '{}'.".format(column))
@@ -300,39 +327,54 @@ def dataframe_to_bq_schema(dataframe, bq_schema):
300327
if any(field.field_type is None for field in bq_schema_out):
301328
if not pyarrow:
302329
return None # We cannot detect the schema in full.
330+
bq_schema_out = _currate_schema(dataframe, bq_schema_out)
331+
332+
return tuple(bq_schema_out)
303333

304-
arrow_table = dataframe_to_arrow(dataframe, bq_schema_out)
305-
arrow_schema_index = {field.name: field.type for field in arrow_table}
306334

307-
currated_schema = []
308-
for schema_field in bq_schema_out:
309-
if schema_field.field_type is not None:
310-
currated_schema.append(schema_field)
311-
continue
335+
def _currate_schema(dataframe, current_bq_schema):
    """Try to deduce the unknown field types of a partial schema with pyarrow.

    For each field in ``current_bq_schema`` whose ``field_type`` is still
    ``None``, convert the corresponding dataframe column to a pyarrow array
    and map the detected arrow type back to a BigQuery type. Requires the
    ``pyarrow`` library to be importable.

    Args:
        dataframe (pandas.DataFrame):
            The data the schema describes; only the columns with an unknown
            type are converted to pyarrow (converting the whole dataframe
            would be wasted work).
        current_bq_schema (Sequence[google.cloud.bigquery.schema.SchemaField]):
            A partially detected schema — fields whose type could not be
            determined have ``field_type`` set to ``None``.

    Returns:
        Optional[List[google.cloud.bigquery.schema.SchemaField]]:
            The input schema with every unknown field type filled in, or
            ``None`` if pyarrow could not determine the type of at least one
            column (a warning is issued for that column).
    """
    # Arrow type of each column whose BigQuery type is still unknown.
    arrow_types = {
        field.name: pyarrow.array(dataframe[field.name]).type
        for field in current_bq_schema
        if field.field_type is None
    }

    currated_schema = []
    for schema_field in current_bq_schema:
        if schema_field.field_type is not None:
            currated_schema.append(schema_field)
            continue

        # Direct indexing is safe: every field with an unknown type received
        # an entry in ``arrow_types`` above.
        detected_type = ARROW_SCALAR_IDS_TO_BQ.get(
            arrow_types[schema_field.name].id
        )
        if detected_type is None:
            warnings.warn(
                u"Pyarrow could not determine the type of column '{}'.".format(
                    schema_field.name
                )
            )
            return None

        currated_schema.append(
            schema.SchemaField(
                name=schema_field.name,
                field_type=detected_type,
                mode=schema_field.mode,
                description=schema_field.description,
                fields=schema_field.fields,
            )
        )

    return currated_schema
336378

337379

338380
def dataframe_to_arrow(dataframe, bq_schema):

0 commit comments

Comments
 (0)