7.1 Prepare diabetes code dictionary

7.1.1 Define keywords

Define diabetes-specific exclusion keywords.

# Diabetes-specific exclusion keywords, joined into a single regex alternation.
# The string deliberately starts with "|" because it is appended directly to
# global_exclusion_keyword_patterns with paste0().
# Each entry is a regular expression, not a literal: "^...$" anchors a whole
# term, and the "." in "preg." matches any single character.
# Fix: the misspelt keyword "eligibiliby" is now "eligibility" so that it can
# actually match term descriptions.
dm_specific_exclusion_keyword_patterns <- paste0(
  "|",
  paste(
    c(
      "serum", "antibody", "remission", "relative", "association member",
      "high risk of diabetes", "suspected diabetes", "non-diabetes",
      "breath test", "questionnaire", "risk score", "category score",
      "risk calculator", "inhibitor function", "C-peptide level",
      "factor binding protein 3", "stress test", "Insulin tolerance test",
      "Urine screening test", "X-ray", "NHS Diabetes Prevention",
      "Provision of diabetes clinical summary", "diabetes mellitus screen",
      "leaflet given", "declined", "C1-esterase", "pituitary", "helicobacter",
      "ineligible", "invite", "invitation", "insulinoma", "steroid",
      "secondary diabetes", "pre-diabet", "prediabet", "insipid",
      "provision of written information", "not required", "national audit",
      "diabetes screen", "renal diabetes", "non-diabet", "^diabetic nurse$",
      "^Diabetic liaison nurse$", "jamaica", "secondary pancreatic diabetes",
      "driving", "neonatal", "Addison", "PABA test", "growth factor",
      "Plasma insulin level", "key contact", "eligibility", "CHA2DS2",
      "Professional judgement", "Diabetes mellitus: no", "non diabetic",
      "information prescription", "mother has", "preg.", "bronzed",
      "Diabetes dietitian", "Urine Ketone Test", "deleted", "refuse",
      "gastropathy", "pneumon", "Frequency of hypoglycaem", "^insulin level$",
      "drug-induced", "drug induced", "hyperglyceridaemia"
    ),
    collapse = "|"
  )
)

Define the full set of diabetes exclusion keywords by appending the diabetes-specific patterns to the global exclusion patterns.

# Full exclusion regex: the global exclusion patterns immediately followed by
# the diabetes-specific ones. No separator is inserted here; the
# diabetes-specific string supplies its own leading "|".
dm_exclusion_keyword_patterns <- paste0(
  global_exclusion_keyword_patterns,
  dm_specific_exclusion_keyword_patterns
)

Define inclusion keywords.

# Keywords whose presence in a term description makes a code a candidate for
# the diabetes dictionary, joined into one regex alternation.
# NOTE(review): only the US spelling "glycemic control" is listed; confirm
# whether the British spelling "glycaemic" should be matched as well.
dm_inclusion_keyword_patterns <- paste(
  c(
    "diabetic", "diabetes", "diabeto", "insulin",
    "hyperglyc", "hypoglyc", "glycemic control"
  ),
  collapse = "|"
)

7.1.2 Define codes

Define diabetes exclusion codes.

# Codes removed from the diabetes dictionary by exact match.
# Fix: "C1..." was previously written with a single Unicode ellipsis
# character ("C1…"), which can never equal a code padded with ASCII dots.
# NOTE(review): "J4z0" has four characters while the other entries have
# five - confirm it is not missing a trailing character.
dm_exclusion_codes <- c(
  "ZV653", "C3760", "J4z0", "Y3045", "7L1L2", "Y0015", "X789v", "Y7ITk",
  "Y2200", "42c..", "42W..", "42WZ.", "66Ae.", "66Ae0", "66AF.", "C1...",
  "XaCET", "XaCEU", "XaCEV"
)

# Code prefixes removed from the dictionary by (case-sensitive) regex match.
dm_exclusion_code_patterns <- paste(c("^42W", "^42c"), collapse = "|")

Import diabetes code lists.

# Build one combined diabetes code list from the OpenSAFELY and CALIBER files.
# All joins are full joins on whatever columns the two sides share (no `by`
# is given), so every row from every source list is retained.
dm_codelist <- local({
  # OpenSAFELY lists. The type 1 / type 2 lists are tagged with Category 1 / 2,
  # and the Exeter list's `ctvterm` column is aligned with the others' naming.
  os_general <- fread("raw_data/dm_code_lists/opensafely-diabetes-2020-04-15.csv")
  os_type1 <- fread("raw_data/dm_code_lists/opensafely-type-1-diabetes-2020-06-29.csv") %>%
    mutate(Category = 1)
  os_type2 <- fread("raw_data/dm_code_lists/opensafely-type-2-diabetes-2020-06-29.csv") %>%
    mutate(Category = 2)
  os_exeter <- fread("raw_data/dm_code_lists/opensafely-diabetes-exeter-group-2020-07-06.csv") %>%
    dplyr::rename(CTV3PreferredTermDesc = ctvterm)

  opensafely_codes <- os_general %>%
    full_join(os_type1) %>%
    full_join(os_type2) %>%
    full_join(os_exeter) %>%
    dplyr::rename(
      term_description = CTV3PreferredTermDesc,
      code = CTV3ID
    ) %>%
    # Category is converted to character before joining the CALIBER lists.
    mutate(Category = as.character(Category))

  # CALIBER lists, renamed to the same code / term_description / Category
  # column names used above.
  caliber_codes <- fread("raw_data/dm_code_lists/read_diabetescomplications_caliber.txt") %>%
    full_join(fread("raw_data/dm_code_lists/read_diabetes_expanded_caliber.txt")) %>%
    full_join(fread("raw_data/dm_code_lists/read_diabetes_caliber.txt")) %>%
    dplyr::rename(
      code = Clinical_code,
      term_description = Clinical_term,
      Category = `Category_(code)`
    )

  full_join(opensafely_codes, caliber_codes)
})

Define inclusion codes.

# Every code that appears in the combined published code lists.
dm_inclusion_codes <- dm_codelist[["code"]]

Define additional diabetes code patterns.

# Code prefixes that are included regardless of the term description,
# combined into a single regex alternation.
dm_inclusion_code_patterns <- paste("^66A", "^C10", "^F420", sep = "|")

7.1.3 Create diabetes code dictionary

Create diabetes code dictionary.

# Build the candidate diabetes dictionary from the full code dictionary.
# Step 1 keeps a row if ANY inclusion rule matches: keyword in the term
# description (case-insensitive), code in the published lists, or code
# matching an inclusion prefix.
# Step 2 drops a row if ANY exclusion rule matches: exclusion keyword in the
# description (case-insensitive), excluded code, or excluded code prefix.
# `TRUE` is spelled out instead of `T`, which is an ordinary variable that
# can be reassigned.
dm_dict <- full_dict %>%
  filter(
    grepl(dm_inclusion_keyword_patterns, term_description, ignore.case = TRUE) |
      code %in% dm_inclusion_codes |
      grepl(dm_inclusion_code_patterns, code)
  ) %>%
  filter(
    !grepl(dm_exclusion_keyword_patterns, term_description, ignore.case = TRUE),
    !(code %in% dm_exclusion_codes),
    !grepl(dm_exclusion_code_patterns, code)
  )

We want to restrict the outcome-specific dictionaries to the codes that actually occur in the primary care (PC) data, to expedite review of the terms that we include. First, read in the distinct codes that appear in the PC data.

# Distinct codes present in the formatted GP clinical extract; only the
# `code` column is read from the file.
terms_actual <- distinct(
  fread(
    "generated_data/entire_gp_clinical_30March2021_formatted.txt",
    select = "code"
  )
)

Now, filter the DM dictionary to terms that exist in the PC data.

# Keep only dictionary entries whose code occurs in the GP clinical data.
dm_dict_actual <- filter(dm_dict, code %in% terms_actual$code)

# Review table: one row per code, holding only code and term_description.
# The first distinct() reduces the table to unique code/description pairs;
# the second keeps the first remaining description for each code.
# `TRUE` is spelled out instead of `T`, which is an ordinary variable that
# can be reassigned.
dm_dict_review <- dm_dict_actual %>%
  distinct(code, term_description) %>%
  distinct(code, .keep_all = TRUE)

Now, map the Read v2 terms in the dictionary to CTV3, and vice versa, to make sure we capture equivalent terms. The term mappings are provided by UKB in Resource 592.

# Mapping sheets from the lookup workbook, selected by sheet position.
# NOTE(review): sheets 14 and 19 are presumably the Read v2 -> CTV3 and
# CTV3 -> Read v2 maps respectively - confirm against the workbook.
map23 <- read_xlsx("raw_data/all_lkps_maps_v3.xlsx", sheet = 14)
map32 <- read_xlsx("raw_data/all_lkps_maps_v3.xlsx", sheet = 19)

# Bidirectional code map with columns
# code, mapped_code, terminology, mapped_terminology.
read_map <- rbind(
  # CTV3 -> Read v2: only rows flagged IS_ASSURED == 1 are used.
  map32 %>%
    filter(IS_ASSURED == 1) %>%
    select(code = READV3_CODE, mapped_code = READV2_CODE) %>%
    mutate(terminology = "read3", mapped_terminology = "read2"),
  # Read v2 -> CTV3: all rows are used.
  map23 %>%
    select(code = READV2_CODE, mapped_code = READV3_CODE) %>%
    mutate(terminology = "read2", mapped_terminology = "read3")
) %>%
  distinct() %>%
  filter(
    # Drop identity mappings.
    code != mapped_code,
    # Remove very broad mappings (codes containing two consecutive dots).
    !(grepl("\\.\\.", mapped_code) | grepl("\\.\\.", code)),
    # Only keep pairs where both codes exist in gp_clinical.
    code %in% terms_actual$code,
    mapped_code %in% terms_actual$code
  )

Get any additional mapped codes to include in the DM dictionary.

# Find codes that map to a dictionary entry but are not yet in the diabetes
# dictionary, and attach their descriptions from the full dictionary.
# NOTE(review): the result is still grouped by mapped_code (no ungroup()
# follows the group_by()); confirm downstream code expects that.
dm_terms_map <- dm_dict_actual %>%
  left_join(read_map) %>%
  filter(
    # Keep only entries that have a mapping ...
    !is.na(mapped_code),
    # ... whose target is not already in the (unfiltered) DM dictionary.
    !(mapped_code %in% dm_dict$code)
  ) %>%
  arrange(code) %>%
  # Drop the source entry's note before joining the mapped code's details,
  # so that the join is not made on this column.
  select(-terminology_note) %>%
  left_join(
    full_dict %>%
      dplyr::rename(
        mapped_code = code,
        mapped_description = term_description,
        mapped_terminology = terminology
      )
  ) %>%
  # Keep the first row for each mapped code.
  group_by(mapped_code) %>%
  slice(1) %>%
  distinct() %>%
  filter(!(grepl("[Dd]rug induced", mapped_description)))

Combine the new terms with the original DM dictionary.

# Append the newly mapped codes to the filtered dictionary. The mapped
# columns are renamed so that they line up with the dictionary's own
# code / term_description / terminology / terminology_note columns.
dm_dict_final <- dm_terms_map %>%
  select(
    code = mapped_code,
    term_description = mapped_description,
    terminology = mapped_terminology,
    terminology_note
  ) %>%
  rbind(dm_dict_actual, .)

Save diabetes code dictionary.

# Persist the final diabetes dictionary for later steps.
saveRDS(dm_dict_final, file = "generated_data/dm_dict.RDS")