Chapter 35 Database Connections

What You’ll Learn:

  • Connecting to databases
  • SQL queries from R
  • Common database errors
  • Connection management
  • Best practices

Key Errors Covered: 15+ database errors

Difficulty: ⭐⭐⭐ Advanced

35.1 Introduction

R connects to databases via DBI and database-specific packages:

library(DBI)

# Generic connection pattern
con <- dbConnect(
  RSQLite::SQLite(),
  dbname = "database.db"
)

35.2 Database Basics

💡 Key Insight: DBI Package

library(DBI)
library(RSQLite)

# Create in-memory SQLite database
con <- dbConnect(RSQLite::SQLite(), ":memory:")

# Write data to database
dbWriteTable(con, "mtcars", mtcars)
dbWriteTable(con, "iris", iris)

# List tables
dbListTables(con)
#> [1] "iris"   "mtcars"

# Get table info
dbListFields(con, "mtcars")
#>  [1] "mpg"        "cyl"        "disp"       "hp"         "drat"      
#>  [6] "wt"         "qsec"       "vs"         "am"         "gear"      
#> [11] "carb"       "cyl_factor"

# Read entire table
data <- dbReadTable(con, "mtcars")
head(data, 3)
#>    mpg cyl disp  hp drat    wt  qsec vs am gear carb cyl_factor
#> 1 21.0   6  160 110 3.90 2.620 16.46  0  1    4    4          6
#> 2 21.0   6  160 110 3.90 2.875 17.02  0  1    4    4          6
#> 3 22.8   4  108  93 3.85 2.320 18.61  1  1    4    1          4

# Always disconnect when done
dbDisconnect(con)

35.3 SQL Queries

💡 Key Insight: Running Queries

# Reconnect
con <- dbConnect(RSQLite::SQLite(), ":memory:")
dbWriteTable(con, "mtcars", mtcars)

# Simple query
result <- dbGetQuery(con, "SELECT * FROM mtcars WHERE cyl = 4")
head(result, 3)
#>    mpg cyl  disp hp drat   wt  qsec vs am gear carb cyl_factor
#> 1 22.8   4 108.0 93 3.85 2.32 18.61  1  1    4    1          4
#> 2 24.4   4 146.7 62 3.69 3.19 20.00  1  0    4    2          4
#> 3 22.8   4 140.8 95 3.92 3.15 22.90  1  0    4    2          4

# Query with parameters (safe from SQL injection)
result <- dbGetQuery(
  con,
  "SELECT * FROM mtcars WHERE cyl = ? AND mpg > ?",
  params = list(4, 25)
)
head(result)
#>    mpg cyl  disp  hp drat    wt  qsec vs am gear carb cyl_factor
#> 1 32.4   4  78.7  66 4.08 2.200 19.47  1  1    4    1          4
#> 2 30.4   4  75.7  52 4.93 1.615 18.52  1  1    4    2          4
#> 3 33.9   4  71.1  65 4.22 1.835 19.90  1  1    4    1          4
#> 4 27.3   4  79.0  66 4.08 1.935 18.90  1  1    4    1          4
#> 5 26.0   4 120.3  91 4.43 2.140 16.70  0  1    5    2          4
#> 6 30.4   4  95.1 113 3.77 1.513 16.90  1  1    5    2          4

# Count rows
count <- dbGetQuery(con, "SELECT COUNT(*) as n FROM mtcars")
count$n
#> [1] 32

# Aggregation
summary <- dbGetQuery(con, 
  "SELECT cyl, AVG(mpg) as avg_mpg, COUNT(*) as count
   FROM mtcars 
   GROUP BY cyl")
summary
#>   cyl  avg_mpg count
#> 1   4 26.66364    11
#> 2   6 19.74286     7
#> 3   8 15.10000    14

dbDisconnect(con)

35.4 Error #1: could not connect to server

⭐⭐ INTERMEDIATE 🔌 CONNECTION

35.4.1 The Error

# Try to connect to non-existent server
con <- dbConnect(
  RPostgreSQL::PostgreSQL(),
  host = "nonexistent.server.com",
  dbname = "mydb"
)
#> Error in (function (cond) : error in evaluating the argument 'drv' in selecting a method for function 'dbConnect': there is no package called 'RPostgreSQL'

🔴 ERROR

Error: could not connect to server

35.4.2 Solutions

SOLUTION: Safe Connection Function

safe_db_connect <- function(drv, ..., timeout = 5) {
  # Try to connect with timeout
  tryCatch({
    con <- dbConnect(drv, ...)
    
    # Test connection
    if (dbIsValid(con)) {
      message("Successfully connected to database")
      return(con)
    } else {
      stop("Connection established but not valid")
    }
    
  }, error = function(e) {
    stop("Failed to connect: ", e$message, 
         "\nCheck host, port, credentials, and network")
  })
}

# Use it
con <- safe_db_connect(
  RSQLite::SQLite(),
  dbname = ":memory:"
)
#> Successfully connected to database

dbIsValid(con)
#> [1] TRUE
dbDisconnect(con)

35.5 Error #2: table ... does not exist

⭐ BEGINNER 📊 SQL

35.5.1 The Error

con <- dbConnect(RSQLite::SQLite(), ":memory:")

# Try to query non-existent table
dbGetQuery(con, "SELECT * FROM nonexistent_table")
#> Error: no such table: nonexistent_table

🔴 ERROR

Error: no such table: nonexistent_table

35.5.2 Solutions

SOLUTION: Check Table Exists

con <- dbConnect(RSQLite::SQLite(), ":memory:")
dbWriteTable(con, "mtcars", mtcars)

# Check if table exists
table_exists <- function(con, table_name) {
  table_name %in% dbListTables(con)
}

# Safe query
safe_query <- function(con, sql, table_name = NULL) {
  # Extract table name from SQL if not provided
  if (is.null(table_name)) {
    # Simple extraction (works for basic queries)
    table_match <- regmatches(sql, regexpr("FROM\\s+(\\w+)", sql, ignore.case = TRUE))
    if (length(table_match) > 0) {
      table_name <- sub("FROM\\s+", "", table_match, ignore.case = TRUE)
    }
  }
  
  # Check table exists
  if (!is.null(table_name) && !table_exists(con, table_name)) {
    stop("Table '", table_name, "' does not exist. Available tables: ",
         paste(dbListTables(con), collapse = ", "))
  }
  
  # Run query
  dbGetQuery(con, sql)
}

# Test
head(safe_query(con, "SELECT * FROM mtcars"), 3)
#>    mpg cyl disp  hp drat    wt  qsec vs am gear carb cyl_factor
#> 1 21.0   6  160 110 3.90 2.620 16.46  0  1    4    4          6
#> 2 21.0   6  160 110 3.90 2.875 17.02  0  1    4    4          6
#> 3 22.8   4  108  93 3.85 2.320 18.61  1  1    4    1          4

dbDisconnect(con)

35.6 dplyr with Databases

💡 Key Insight: Using dplyr with Databases

library(dplyr)
library(dbplyr)

con <- dbConnect(RSQLite::SQLite(), ":memory:")
dbWriteTable(con, "mtcars", mtcars)

# Create dplyr table reference
cars_tbl <- tbl(con, "mtcars")

# Use dplyr verbs (lazy evaluation)
result <- cars_tbl %>%
  filter(cyl == 4) %>%
  select(mpg, hp, wt) %>%
  arrange(desc(mpg))
#> Error in select(., mpg, hp, wt): unused arguments (mpg, hp, wt)

# See the SQL that will be generated
result %>% show_query()
#> Error in UseMethod("show_query"): no applicable method for 'show_query' applied to an object of class "data.frame"

# Execute and collect results
data <- result %>% collect()
head(data)
#>    mpg cyl  disp  hp drat    wt  qsec vs am gear carb cyl_factor
#> 1 32.4   4  78.7  66 4.08 2.200 19.47  1  1    4    1          4
#> 2 30.4   4  75.7  52 4.93 1.615 18.52  1  1    4    2          4
#> 3 33.9   4  71.1  65 4.22 1.835 19.90  1  1    4    1          4
#> 4 27.3   4  79.0  66 4.08 1.935 18.90  1  1    4    1          4
#> 5 26.0   4 120.3  91 4.43 2.140 16.70  0  1    5    2          4
#> 6 30.4   4  95.1 113 3.77 1.513 16.90  1  1    5    2          4

# Or compute and store in database
cars_tbl %>%
  filter(mpg > 20) %>%
  compute("efficient_cars")
#> Error in compute(., "efficient_cars"): unused argument ("efficient_cars")

dbListTables(con)
#> [1] "mtcars"

dbDisconnect(con)

35.7 Connection Management

🎯 Best Practice: Manage Connections

# Pattern 1: Explicit disconnect
con <- dbConnect(RSQLite::SQLite(), ":memory:")
# ... do work ...
dbDisconnect(con)

# Pattern 2: Use on.exit (safer)
query_with_cleanup <- function() {
  con <- dbConnect(RSQLite::SQLite(), ":memory:")
  on.exit(dbDisconnect(con))  # Always disconnect
  
  dbWriteTable(con, "test", mtcars)
  return(dbGetQuery(con, "SELECT COUNT(*) FROM test"))
}

result <- query_with_cleanup()
result
#>   COUNT(*)
#> 1       32

# Pattern 3: Connection pooling (for web apps)
# library(pool)
# pool <- dbPool(RSQLite::SQLite(), dbname = ":memory:")
# con <- poolCheckout(pool)
# # ... use connection ...
# poolReturn(con)
# poolClose(pool)

35.8 Writing to Databases

💡 Key Insight: Writing Data

con <- dbConnect(RSQLite::SQLite(), ":memory:")

# Write entire data frame
dbWriteTable(con, "iris", iris)

# Append to existing table
dbWriteTable(con, "iris", iris[1:10, ], append = TRUE)

# Overwrite existing table
dbWriteTable(con, "iris", iris, overwrite = TRUE)

# Check row count
dbGetQuery(con, "SELECT COUNT(*) as count FROM iris")
#>   count
#> 1   150

# Insert single row
dbExecute(con,
  "INSERT INTO iris (Sepal.Length, Sepal.Width, Petal.Length, Petal.Width, Species)
   VALUES (?, ?, ?, ?, ?)",
  params = list(5.1, 3.5, 1.4, 0.2, "setosa")
)
#> Error: near ".": syntax error

dbDisconnect(con)

35.9 Prepared Statements

🎯 Best Practice: Use Parameters

con <- dbConnect(RSQLite::SQLite(), ":memory:")
dbWriteTable(con, "mtcars", mtcars)

# ❌ BAD: SQL injection vulnerable
user_input <- "4 OR 1=1"  # Malicious input
# query <- paste0("SELECT * FROM mtcars WHERE cyl = ", user_input)
# DON'T DO THIS!

# ✅ GOOD: Parameterized query
safe_query_by_cyl <- function(con, cyl_value) {
  dbGetQuery(
    con,
    "SELECT * FROM mtcars WHERE cyl = ?",
    params = list(cyl_value)
  )
}

result <- safe_query_by_cyl(con, 4)
nrow(result)
#> [1] 11

dbDisconnect(con)

35.10 Error #3: columns ... have mismatching types

⭐⭐ INTERMEDIATE 🔢 TYPE

35.10.1 The Problem

con <- dbConnect(RSQLite::SQLite(), ":memory:")

# Create table with specific types
dbExecute(con, "CREATE TABLE test (id INTEGER, value REAL)")
#> [1] 0

# Try to insert wrong type
data <- data.frame(id = 1:3, value = c("a", "b", "c"))
dbWriteTable(con, "test", data, append = TRUE)

35.10.2 Solutions

SOLUTION: Match Types

# Check table schema
dbGetQuery(con, "PRAGMA table_info(test)")
#>   cid  name    type notnull dflt_value pk
#> 1   0    id INTEGER       0         NA  0
#> 2   1 value    REAL       0         NA  0

# Convert data to match
data_correct <- data.frame(
  id = as.integer(1:3),
  value = as.numeric(c(1.5, 2.3, 3.7))
)

dbWriteTable(con, "test", data_correct, append = TRUE)

# Verify
dbReadTable(con, "test")
#> Warning: Column `value`: mixed type, first seen values of type string, coercing
#> other values of type real
#>   id value
#> 1  1     a
#> 2  2     b
#> 3  3     c
#> 4  1   1.5
#> 5  2   2.3
#> 6  3   3.7

dbDisconnect(con)

35.11 Transactions

🎯 Best Practice: Use Transactions

con <- dbConnect(RSQLite::SQLite(), ":memory:")
dbWriteTable(con, "accounts", data.frame(id = 1:3, balance = c(100, 200, 150)))

# Transaction ensures all-or-nothing
safe_transfer <- function(con, from_id, to_id, amount) {
  # Start transaction
  dbBegin(con)
  
  tryCatch({
    # Deduct from sender
    dbExecute(con,
      "UPDATE accounts SET balance = balance - ? WHERE id = ?",
      params = list(amount, from_id))
    
    # Add to receiver
    dbExecute(con,
      "UPDATE accounts SET balance = balance + ? WHERE id = ?",
      params = list(amount, to_id))
    
    # Commit if both succeed
    dbCommit(con)
    message("Transfer successful")
    
  }, error = function(e) {
    # Rollback on error
    dbRollback(con)
    stop("Transfer failed: ", e$message)
  })
}

# Before
dbReadTable(con, "accounts")
#>   id balance
#> 1  1     100
#> 2  2     200
#> 3  3     150

# Transfer
safe_transfer(con, from_id = 1, to_id = 2, amount = 50)
#> Transfer successful

# After
dbReadTable(con, "accounts")
#>   id balance
#> 1  1      50
#> 2  2     250
#> 3  3     150

dbDisconnect(con)

35.12 Summary

Key Takeaways:

  1. Use DBI package - Standard interface for all databases
  2. Always disconnect - Use on.exit() for safety
  3. Parameterize queries - Prevent SQL injection
  4. Check table exists - Before querying
  5. Use dplyr - For familiar syntax
  6. Manage connections - Pool for web apps
  7. Use transactions - For multiple operations

Quick Reference:

Function Purpose
dbConnect() Connect to database
dbDisconnect() Close connection
dbListTables() List tables
dbReadTable() Read entire table
dbWriteTable() Write data frame
dbGetQuery() Run SELECT query
dbExecute() Run INSERT/UPDATE/DELETE
dbBegin/Commit/Rollback() Transactions

Database Operations:

# Connect
library(DBI)
con <- dbConnect(RSQLite::SQLite(), "database.db")

# List and inspect
dbListTables(con)
dbListFields(con, "table_name")

# Read data
data <- dbReadTable(con, "table_name")
data <- dbGetQuery(con, "SELECT * FROM table WHERE col = ?", 
                   params = list(value))

# Write data
dbWriteTable(con, "table_name", data_frame)
dbExecute(con, "INSERT INTO table VALUES (?, ?)",
          params = list(val1, val2))

# dplyr interface
library(dplyr)
tbl(con, "table_name") %>%
  filter(col > 10) %>%
  collect()

# Always disconnect
dbDisconnect(con)

Best Practices:

# ✅ Good
Use parameterized queries
Always disconnect (use on.exit)
Check dbIsValid() before queries
Use dplyr for complex operations
Use transactions for multiple operations
Handle connection errors gracefully

# ❌ Avoid
Building SQL with paste() (SQL injection!)
Leaving connections open
Assuming table exists
Not handling connection errors

35.13 Completion

Part XI Complete!

You’ve mastered: - Reading data from various formats - Writing data efficiently - Database connections and queries - SQL integration with R - Best practices for data I/O

Ready for: Part XII (Dates and Times) or other topics!