I started looking at this and I think I've found a pretty efficient way to do upserts in sqlalchemy with a mix of bulk_insert_mappings
and bulk_update_mappings
instead of merge
.
import time
import sqlite3
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import Column, Integer, String, create_engine
from sqlalchemy.orm import scoped_session, sessionmaker
from contextlib import contextmanager
engine = None
Session = sessionmaker()
Base = declarative_base()
def creat_new_database(db_name="sqlite:///bulk_upsert_sqlalchemy.db"):
global engine
engine = create_engine(db_name, echo=False)
local_session = scoped_session(Session)
local_session.remove()
local_session.configure(bind=engine, autoflush=False, expire_on_commit=False)
Base.metadata.drop_all(engine)
Base.metadata.create_all(engine)
@contextmanager
def db_session():
local_session = scoped_session(Session)
session = local_session()
session.expire_on_commit = False
try:
yield session
except BaseException:
session.rollback()
raise
finally:
session.close()
class Customer(Base):
__tablename__ = "customer"
id = Column(Integer, primary_key=True)
name = Column(String(255))
def bulk_upsert_mappings(customers):
entries_to_update = []
entries_to_put = []
with db_session() as sess:
t0 = time.time()
# Find all customers that needs to be updated and build mappings
for each in (
sess.query(Customer.id).filter(Customer.id.in_(customers.keys())).all()
):
customer = customers.pop(each.id)
entries_to_update.append({"id": customer["id"], "name": customer["name"]})
# Bulk mappings for everything that needs to be inserted
for customer in customers.values():
entries_to_put.append({"id": customer["id"], "name": customer["name"]})
sess.bulk_insert_mappings(Customer, entries_to_put)
sess.bulk_update_mappings(Customer, entries_to_update)
sess.commit()
print(
"Total time for upsert with MAPPING update "
+ str(len(customers))
+ " records "
+ str(time.time() - t0)
+ " sec"
+ " inserted : "
+ str(len(entries_to_put))
+ " - updated : "
+ str(len(entries_to_update))
)
def bulk_upsert_merge(customers):
entries_to_update = 0
entries_to_put = []
with db_session() as sess:
t0 = time.time()
# Find all customers that needs to be updated and merge
for each in (
sess.query(Customer.id).filter(Customer.id.in_(customers.keys())).all()
):
values = customers.pop(each.id)
sess.merge(Customer(id=values["id"], name=values["name"]))
entries_to_update += 1
# Bulk mappings for everything that needs to be inserted
for customer in customers.values():
entries_to_put.append({"id": customer["id"], "name": customer["name"]})
sess.bulk_insert_mappings(Customer, entries_to_put)
sess.commit()
print(
"Total time for upsert with MERGE update "
+ str(len(customers))
+ " records "
+ str(time.time() - t0)
+ " sec"
+ " inserted : "
+ str(len(entries_to_put))
+ " - updated : "
+ str(entries_to_update)
)
if __name__ == "__main__":
batch_size = 10000
# Only inserts
customers_insert = {
i: {"id": i, "name": "customer_" + str(i)} for i in range(batch_size)
}
# 50/50 inserts update
customers_upsert = {
i: {"id": i, "name": "customer_2_" + str(i)}
for i in range(int(batch_size / 2), batch_size + int(batch_size / 2))
}
creat_new_database()
bulk_upsert_mappings(customers_insert.copy())
bulk_upsert_mappings(customers_upsert.copy())
bulk_upsert_mappings(customers_insert.copy())
creat_new_database()
bulk_upsert_merge(customers_insert.copy())
bulk_upsert_merge(customers_upsert.copy())
bulk_upsert_merge(customers_insert.copy())
The results for the benchmark:
Total time for upsert with MAPPING: 0.17138004302978516 sec inserted : 10000 - updated : 0
Total time for upsert with MAPPING: 0.22074174880981445 sec inserted : 5000 - updated : 5000
Total time for upsert with MAPPING: 0.22307634353637695 sec inserted : 0 - updated : 10000
Total time for upsert with MERGE: 0.1724097728729248 sec inserted : 10000 - updated : 0
Total time for upsert with MERGE: 7.852903842926025 sec inserted : 5000 - updated : 5000
Total time for upsert with MERGE: 15.11970829963684 sec inserted : 0 - updated : 10000