Debuting with MIMIC 3 and R - Part I: Audit

8 minute read

A friend of mine invited me recently to work on the MIMIC 3 database. For more on mimic, just follow this link.

Most of the people working on that database are medical staff and may have difficulties on beginning their studies.

This article is an exemple of how to handle the database, looking at one effect, the albumine value.

First of all, the first thing to do is familiarise yourself with the database. And for this, nothing is better than mapping the main tables.

Parameter

To query the database, I use the sqldf package. To manipulate the data, the data.table package. To plot the graph, the ggplot2 package.

library(sqldf) # install.packages("sqldf")
library(RPostgreSQL)
library(data.table) # install.packages("data.table")
library(ggplot2) # install.packages("ggplot2")
# connexion:
drv <- dbDriver("PostgreSQL")
con <- dbConnect(drv, dbname=dbname,host=host,port=port,user=user,password=pwd)

# parametrisation of the sqldf function.
options(sqldf.RPostgreSQL.user = user
        , sqldf.RPostgreSQL.password = pwd
        , sqldf.RPostgreSQL.dbname = dbname
        , sqldf.RPostgreSQL.host = host
        , sqldf.RPostgreSQL.port = port)

Patients

Simple counts.

query <- "select count(*) as nb, count(distinct subject_id) as nb_subject from mimiciii.patients;"
count.patients <- sqldf(query)
count.patients
##      nb nb_subject
## 1 46520      46520

Mapping table:

column.of.interest <- c("column_name", "is_nullable", "data_type")
query <- "SELECT * FROM information_schema.columns WHERE table_schema = 'mimiciii' AND table_name = 'patients' ;"
info.patients <- sqldf(query)
info.patients[, column.of.interest]
##            column_name is_nullable                   data_type
## 1               row_id          NO                     integer
## 2           subject_id          NO                     integer
## 3               gender          NO           character varying
## 4                  dob          NO timestamp without time zone
## 5                  dod         YES timestamp without time zone
## 6             dod_hosp         YES timestamp without time zone
## 7              dod_ssn         YES timestamp without time zone
## 8 hospital_expire_flag          NO           character varying

I look at the top 10 as well.

query <- "select * from mimiciii.patients limit 10;"
sqldf(query)

Admissions

Simple counts.

query <- "select count(*) as nb, count(distinct subject_id) as nb_subject, count(distinct hadm_id) as nb_admissions from mimiciii.admissions;"
count.admission <- sqldf(query)
count.admission
##      nb nb_subject nb_admissions
## 1 58976      46520         58976

Mapping table:

query <- "SELECT * FROM information_schema.columns WHERE table_schema = 'mimiciii' AND table_name = 'admissions' ;"
info.admission <- sqldf(query)
info.admission[, column.of.interest]
##             column_name is_nullable                   data_type
## 1                row_id          NO                     integer
## 2            subject_id          NO                     integer
## 3               hadm_id          NO                     integer
## 4             admittime          NO timestamp without time zone
## 5             dischtime          NO timestamp without time zone
## 6             deathtime         YES timestamp without time zone
## 7        admission_type          NO           character varying
## 8    admission_location          NO           character varying
## 9    discharge_location          NO           character varying
## 10            insurance          NO           character varying
## 11             language         YES           character varying
## 12             religion         YES           character varying
## 13       marital_status         YES           character varying
## 14            ethnicity          NO           character varying
## 15            diagnosis         YES           character varying
## 16    has_ioevents_data          NO                    smallint
## 17 has_chartevents_data          NO                    smallint

ICU stay events

Simple counts.

query <- "select count(*) as nb, count(distinct subject_id) as nb_subject, count(distinct icustay_id) as nb_icu_stay, count(distinct hadm_id) as nb_admission from mimiciii.icustayevents;"
count.icustayevents <- sqldf(query)
count.icustayevents
##      nb nb_subject nb_icu_stay nb_admission
## 1 61532      46476       61532        57786

Mapping table:

query <- "SELECT * FROM information_schema.columns WHERE table_schema = 'mimiciii' AND table_name = 'icustayevents' ;"
info.icustayevents <- sqldf(query)
info.icustayevents[, column.of.interest]
##       column_name is_nullable                   data_type
## 1          row_id          NO                     integer
## 2      subject_id          NO                     integer
## 3         hadm_id          NO                     integer
## 4      icustay_id          NO                     integer
## 5        dbsource          NO           character varying
## 6  first_careunit          NO           character varying
## 7   last_careunit          NO           character varying
## 8    first_wardid          NO                    smallint
## 9     last_wardid          NO                    smallint
## 10         intime          NO timestamp without time zone
## 11        outtime         YES timestamp without time zone
## 12            los         YES            double precision

Services

Simple counts.

query <- "select count(*) as nb, count(distinct subject_id) as nb_subject, count(distinct hadm_id) as nb_admission from mimiciii.services;"
count.services <-sqldf(query)
count.services
##      nb nb_subject nb_admission
## 1 73343      46496        58926

Mapping table:

query <- "SELECT * FROM information_schema.columns WHERE table_schema = 'mimiciii' AND table_name = 'services' ;"
info.services <- sqldf(query)
info.services[, column.of.interest]
##    column_name is_nullable                   data_type
## 1       row_id          NO                     integer
## 2   subject_id          NO                     integer
## 3      hadm_id          NO                     integer
## 4 transfertime          NO timestamp without time zone
## 5 prev_service         YES           character varying
## 6 curr_service         YES           character varying

Transfers

Simple counts.

query <- "select count(*) as nb, count(distinct subject_id) as nb_subject, count(distinct hadm_id) as nb_admission, count(distinct icustay_id) as nb_icustay from mimiciii.transfers;"
count.transfers <- sqldf(query)
count.transfers
##       nb nb_subject nb_admission nb_icustay
## 1 261897      46520        58976      61532

Mapping table:

query <- "SELECT * FROM information_schema.columns WHERE table_schema = 'mimiciii' AND table_name = 'transfers' ;"
info.transfers <- sqldf(query)
info.transfers[, column.of.interest]
##      column_name is_nullable                   data_type
## 1         row_id          NO                     integer
## 2     subject_id          NO                     integer
## 3        hadm_id          NO                     integer
## 4     icustay_id         YES                     integer
## 5       dbsource          NO           character varying
## 6      eventtype         YES           character varying
## 7  prev_careunit         YES           character varying
## 8  curr_careunit         YES           character varying
## 9    prev_wardid         YES                    smallint
## 10   curr_wardid         YES                    smallint
## 11        intime         YES timestamp without time zone
## 12       outtime         YES timestamp without time zone
## 13           los         YES            double precision

Tags: ,

Categories:

Updated:

Leave a Comment