I have been messing around this morning with moving data from MySQL to BigQuery. I found loading through a CSV file very difficult: more restrictions and complications.
Below is a Python script that will build the table schema from the MySQL table definition and stream the data directly into the BigQuery table.
My database is on Cloud SQL, so you may need to change the connection string. Fill in the missing values for your particular situation, then call it with:
SQLToBQBatch(tableName, limit)
I put the limit in for testing. For my final test I passed 999999999 as the limit and everything worked fine.
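For example, assuming a hypothetical MySQL table named orders that you want copied in full, the call would just be:

SQLToBQBatch("orders", 999999999)  # "orders" is a placeholder table name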
I would recommend using a backend module to run anything over 5 GB.
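If you go down that route, here is a minimal backends.yaml sketch for an App Engine backend module; the backend name and instance class are placeholders, not something the script depends on:

backends:
- name: bqloader   # hypothetical backend name
  class: B4        # larger instance class for long-running loads
  instances: 1
  options: dynamic

There is also a rough handler sketch after the script showing one way to kick the job off.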
Use "RowToJSON" to clean up and invalid characters (ie anything non utf8).
I haven't tested it on 5 GB, but it was able to do 50k rows in about 20 seconds. The same load through CSV took over 2 minutes.
I wrote this to test things, so please excuse the bad coding practices and mini hacks. It works, so feel free to clean it up for any production-level work.
import MySQLdb
import logging
from apiclient.discovery import build
from oauth2client.appengine import AppAssertionCredentials
import httplib2
OAUTH_SCOPE = 'https://www.googleapis.com/auth/bigquery'
PROJECT_ID = ''         # your BigQuery project ID
DATASET_ID = ''         # destination dataset ID
TABLE_ID = ''           # destination table ID
SQL_DATABASE_NAME = ''  # Cloud SQL instance name
SQL_DATABASE_DB = ''    # MySQL database (schema) name
SQL_USER = ''           # MySQL user
SQL_PASS = ''           # MySQL password
def Connect():
    # Connects over the Cloud SQL unix socket; change this if your
    # database lives somewhere other than Cloud SQL.
    return MySQLdb.connect(unix_socket='/cloudsql/' + SQL_DATABASE_NAME,
                           db=SQL_DATABASE_DB, user=SQL_USER, passwd=SQL_PASS)
def RowToJSON(cursor, row, fields):
    # Converts one MySQL row into a dict keyed by column name, coercing
    # numeric columns and scrubbing characters BigQuery rejects.
    newData = {}
    for i, value in enumerate(row):
        try:
            if fields[i]["type"] == bqTypeDict["int"]:
                value = int(value)
            else:
                value = float(value)
        except (TypeError, ValueError):
            if value is not None:
                value = value.replace("\x92", "'") \
                             .replace("\x96", "'") \
                             .replace("\x93", '"') \
                             .replace("\x94", '"') \
                             .replace("\x97", '-') \
                             .replace("\xe9", 'e') \
                             .replace("\x91", "'") \
                             .replace("\x85", "...") \
                             .replace("\xb4", "'") \
                             .replace('"', '""')
        newData[cursor.description[i][0]] = value
    return newData
def GetBuilder():
    # Builds an authenticated BigQuery client using the App Engine
    # service account credentials.
    credentials = AppAssertionCredentials(scope=OAUTH_SCOPE)
    return build('bigquery', 'v2', http=credentials.authorize(httplib2.Http()))
bqTypeDict = {'int'      : 'INTEGER',
              'varchar'  : 'STRING',
              'double'   : 'FLOAT',
              'tinyint'  : 'INTEGER',
              'decimal'  : 'FLOAT',
              'text'     : 'STRING',
              'smallint' : 'INTEGER',
              'char'     : 'STRING',
              'bigint'   : 'INTEGER',
              'float'    : 'FLOAT',
              'longtext' : 'STRING'
              }
def BuildFields(table):
    # Reads the MySQL table definition and translates it into a
    # BigQuery schema (a list of field dicts).
    conn = Connect()
    cursor = conn.cursor()
    cursor.execute("DESCRIBE %s;" % table)
    tableDecorator = cursor.fetchall()
    fields = []
    for col in tableDecorator:
        field = {}
        field["name"] = col[0]
        colType = col[1].split("(")[0]
        if colType not in bqTypeDict:
            logging.warning("Unknown type detected, using string: %s", str(col[1]))
        field["type"] = bqTypeDict.get(colType, "STRING")
        if col[2] == "YES":
            field["mode"] = "NULLABLE"
        fields.append(field)
    return fields
def SQLToBQBatch(table, limit=3000):
    logging.info("****************************************************")
    logging.info("Starting SQLToBQBatch. Got: Table: %s, Limit: %i", table, limit)

    bqDest = GetBuilder()
    fields = BuildFields(table)

    # Create the dataset if it doesn't already exist.
    try:
        response = bqDest.datasets().insert(
            projectId=PROJECT_ID,
            body={'datasetReference': {'datasetId': DATASET_ID}}).execute()
        logging.info("Added Dataset")
        logging.info(response)
    except Exception as e:
        logging.info(e)
        if "Already Exists: " in str(e):
            logging.info("Dataset already exists")
        else:
            logging.error("Error creating dataset: %s", str(e))

    # Create the destination table using the schema built from the MySQL table.
    try:
        response = bqDest.tables().insert(
            projectId=PROJECT_ID,
            datasetId=DATASET_ID,
            body={'tableReference': {'projectId': PROJECT_ID,
                                     'datasetId': DATASET_ID,
                                     'tableId': TABLE_ID},
                  'schema': {'fields': fields}}).execute()
        logging.info("Added Table")
        logging.info(response)
    except Exception as e:
        logging.info(e)
        if "Already Exists: " in str(e):
            logging.info("Table already exists")
        else:
            logging.error("Error creating table: %s", str(e))

    conn = Connect()
    cursor = conn.cursor()

    logging.info("Starting load loop")
    count = -1
    cur_pos = 0
    total = 0
    batch_size = 1000

    # Page through the MySQL table and stream each batch into BigQuery.
    while count != 0 and cur_pos < limit:
        count = 0
        if batch_size + cur_pos > limit:
            batch_size = limit - cur_pos
        sqlCommand = "SELECT * FROM %s LIMIT %i, %i" % (table, cur_pos, batch_size)
        logging.info("Running: %s", sqlCommand)

        cursor.execute(sqlCommand)
        data = []
        for row in cursor.fetchall():
            data.append({"json": RowToJSON(cursor, row, fields)})
            count += 1
        logging.info("Read complete")

        if count != 0:
            logging.info("Sending request")
            insertResponse = bqDest.tabledata().insertAll(
                projectId=PROJECT_ID,
                datasetId=DATASET_ID,
                tableId=TABLE_ID,
                body={"rows": data}).execute()
            cur_pos += batch_size
            total += count
            logging.info("Done %i, Total: %i, Response: %s", count, total, insertResponse)
            if "insertErrors" in insertResponse:
                # insertErrors is a list of {"index": ..., "errors": [...]} entries.
                for insertError in insertResponse["insertErrors"]:
                    logging.error("Error inserting data index: %i", insertError["index"])
                    for error in insertError["errors"]:
                        logging.error(error)
        else:
            logging.info("No more rows")