@article{rongali_learning_2020,
  title = {Learning {Latent} {Space} {Representations} to {Predict} {Patient} {Outcomes}: {Model} {Development} and {Validation}},
  shorttitle = {Learning {Latent} {Space} {Representations} to {Predict} {Patient} {Outcomes}},
  volume = {22},
  number = {3},
  url = {https://www.jmir.org/2020/3/e16374/},
  doi = {10.2196/16374},
  abstract = {Background: Scalable and accurate health outcome prediction using electronic health record (EHR) data has gained much attention in research recently. Previous machine learning models mostly ignore relations between different types of clinical data (ie, laboratory components, International Classification of Diseases codes, and medications).
Objective: This study aimed to model such relations and build predictive models using the EHR data from intensive care units. We developed innovative neural network models and compared them with the widely used logistic regression model and other state-of-the-art neural network models to predict the patient’s mortality using their longitudinal EHR data.
Methods: We built a set of neural network models that we collectively called as long short-term memory (LSTM) outcome prediction using comprehensive feature relations or in short, CLOUT. Our CLOUT models use a correlational neural network model to identify a latent space representation between different types of discrete clinical features during a patient’s encounter and integrate the latent representation into an LSTM-based predictive model framework. In addition, we designed an ablation experiment to identify risk factors from our CLOUT models. Using physicians’ input as the gold standard, we compared the risk factors identified by both CLOUT and logistic regression models.
Results: Experiments on the Medical Information Mart for Intensive Care-III dataset (selected patient population: 7537) show that CLOUT (area under the receiver operating characteristic curve=0.89) has surpassed logistic regression (0.82) and other baseline NN models ({\textless}0.86). In addition, physicians’ agreement with the CLOUT-derived risk factor rankings was statistically significantly higher than the agreement with the logistic regression model.
Conclusions: Our results support the applicability of CLOUT for real-world clinical use in identifying patients at high risk of mortality.},
  language = {en},
  urldate = {2020-04-07},
  journal = {Journal of Medical Internet Research},
  author = {Rongali, Subendhu and Rose, Adam J. and McManus, David D. and Bajracharya, Adarsha S. and Kapoor, Alok and Granillo, Edgard and Yu, Hong},
  year = {2020},
  pmid = {32202503},
  pmcid = {PMC7136840},
  pages = {e16374},
}

Background: Scalable and accurate health outcome prediction using electronic health record (EHR) data has gained much attention in research recently. Previous machine learning models mostly ignore relations between different types of clinical data (ie, laboratory components, International Classification of Diseases codes, and medications). Objective: This study aimed to model such relations and build predictive models using the EHR data from intensive care units. We developed innovative neural network models and compared them with the widely used logistic regression model and other state-of-the-art neural network models to predict the patient’s mortality using their longitudinal EHR data. Methods: We built a set of neural network models that we collectively called as long short-term memory (LSTM) outcome prediction using comprehensive feature relations or in short, CLOUT. Our CLOUT models use a correlational neural network model to identify a latent space representation between different types of discrete clinical features during a patient’s encounter and integrate the latent representation into an LSTM-based predictive model framework. In addition, we designed an ablation experiment to identify risk factors from our CLOUT models. Using physicians’ input as the gold standard, we compared the risk factors identified by both CLOUT and logistic regression models. Results: Experiments on the Medical Information Mart for Intensive Care-III dataset (selected patient population: 7537) show that CLOUT (area under the receiver operating characteristic curve=0.89) has surpassed logistic regression (0.82) and other baseline NN models (<0.86). In addition, physicians’ agreement with the CLOUT-derived risk factor rankings was statistically significantly higher than the agreement with the logistic regression model. Conclusions: Our results support the applicability of CLOUT for real-world clinical use in identifying patients at high risk of mortality. 
Trial Registration: [J Med Internet Res 2020;22(3):e16374]

@inproceedings{jagannatha_calibrating_2020,
  title = {Calibrating {Structured} {Output} {Predictors} for {Natural} {Language} {Processing}},
  abstract = {We address the problem of calibrating prediction confidence for output entities of interest in natural language processing (NLP) applications. It is important that NLP applications such as named entity recognition and question answering produce calibrated confidence scores for their predictions, especially if the system is to be deployed in a safety-critical domain such as healthcare. However, the output space of such structured prediction models is often too large to adapt binary or multi-class calibration methods directly. In this study, we propose a general calibration scheme for output entities of interest in neural-network based structured prediction models. Our proposed method can be used with any binary class calibration scheme and a neural network model. Additionally, we show that our calibration method can also be used as an uncertainty-aware, entity-specific decoding step to improve the performance of the underlying model at no additional training cost or data requirements. We show that our method outperforms current calibration techniques for named-entity-recognition, part-of-speech and question answering. We also improve our model's performance from our decoding step across several tasks and benchmark datasets. Our method improves the calibration and model performance on out-of-domain test scenarios as well.},
  booktitle = {2020 {Annual} {Conference} of the {Association} for {Computational} {Linguistics} ({ACL})},
  author = {Jagannatha, Abhyuday and Yu, Hong},
  year = {2020},
}

We address the problem of calibrating prediction confidence for output entities of interest in natural language processing (NLP) applications. It is important that NLP applications such as named entity recognition and question answering produce calibrated confidence scores for their predictions, especially if the system is to be deployed in a safety-critical domain such as healthcare. However, the output space of such structured prediction models is often too large to adapt binary or multi-class calibration methods directly. In this study, we propose a general calibration scheme for output entities of interest in neural-network based structured prediction models. Our proposed method can be used with any binary class calibration scheme and a neural network model. Additionally, we show that our calibration method can also be used as an uncertainty-aware, entity-specific decoding step to improve the performance of the underlying model at no additional training cost or data requirements. We show that our method outperforms current calibration techniques for named-entity-recognition, part-of-speech and question answering. We also improve our model's performance from our decoding step across several tasks and benchmark datasets. Our method improves the calibration and model performance on out-of-domain test scenarios as well.

Background: Since its inception, artificial intelligence has aimed to use computers to help make clinical diagnoses. Evidence-based medical reasoning is important for patient care. Inferring clinical diagnoses is a crucial step during the patient encounter. Previous works mainly used expert systems or machine learning–based methods to predict the International Classification of Diseases - Clinical Modification codes based on electronic health records. We report an alternative approach: inference of clinical diagnoses from patients’ reported symptoms and physicians’ clinical observations. Objective: We aimed to report a natural language processing system for generating medical assessments based on patient information described in the electronic health record (EHR) notes. Methods: We processed EHR notes into the Subjective, Objective, Assessment, and Plan sections. We trained a neural network model for medical assessment generation (N2MAG). Our N2MAG is an innovative deep neural model that uses the Subjective and Objective sections of an EHR note to automatically generate an “expert-like” assessment of the patient. N2MAG can be trained in an end-to-end fashion and does not require feature engineering and external knowledge resources. Results: We evaluated N2MAG and the baseline models both quantitatively and qualitatively. Evaluated by both the Recall-Oriented Understudy for Gisting Evaluation metrics and domain experts, our results show that N2MAG outperformed the existing state-of-the-art baseline models. Conclusions: N2MAG could generate a medical assessment from the Subjective and Objective section descriptions in EHR notes. Future work will assess its potential for providing clinical decision support. [JMIR Med Inform 2020;8(1):e14971]

@article{chen_detecting_2019,
  title = {Detecting {Hypoglycemia} {Incidents} {Reported} in {Patients}’ {Secure} {Messages}: {Using} {Cost}-{Sensitive} {Learning} and {Oversampling} to {Reduce} {Data} {Imbalance}},
  shorttitle = {Detecting {Hypoglycemia} {Incidents} {Reported} in {Patients}’ {Secure} {Messages}},
  volume = {21},
  number = {3},
  issn = {1439-4456},
  url = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6431826/},
  doi = {10.2196/11990},
  abstract = {Background
Improper dosing of medications such as insulin can cause hypoglycemic episodes, which may lead to severe morbidity or even death. Although secure messaging was designed for exchanging nonurgent messages, patients sometimes report hypoglycemia events through secure messaging. Detecting these patient-reported adverse events may help alert clinical teams and enable early corrective actions to improve patient safety.
Objective
We aimed to develop a natural language processing system, called HypoDetect (Hypoglycemia Detector), to automatically identify hypoglycemia incidents reported in patients’ secure messages.
Methods
An expert in public health annotated 3000 secure message threads between patients with diabetes and US Department of Veterans Affairs clinical teams as containing patient-reported hypoglycemia incidents or not. A physician independently annotated 100 threads randomly selected from this dataset to determine interannotator agreement. We used this dataset to develop and evaluate HypoDetect. HypoDetect incorporates 3 machine learning algorithms widely used for text classification: linear support vector machines, random forest, and logistic regression. We explored different learning features, including new knowledge-driven features. Because only 114 (3.80\%) messages were annotated as positive, we investigated cost-sensitive learning and oversampling methods to mitigate the challenge of imbalanced data.
Results
The interannotator agreement was Cohen kappa=.976. Using cross-validation, logistic regression with cost-sensitive learning achieved the best performance (area under the receiver operating characteristic curve=0.954, sensitivity=0.693, specificity 0.974, F1 score=0.590). Cost-sensitive learning and the ensembled synthetic minority oversampling technique improved the sensitivity of the baseline systems substantially (by 0.123 to 0.728 absolute gains). Our results show that a variety of features contributed to the best performance of HypoDetect.
Conclusions
Despite the challenge of data imbalance, HypoDetect achieved promising results for the task of detecting hypoglycemia incidents from secure messages. The system has a great potential to facilitate early detection and treatment of hypoglycemia.},
  urldate = {2019-12-29},
  journal = {Journal of Medical Internet Research},
  author = {Chen, Jinying and Lalor, John and Liu, Weisong and Druhl, Emily and Granillo, Edgard and Vimalananda, Varsha G and Yu, Hong},
  month = mar,
  year = {2019},
  pmid = {30855231},
  pmcid = {PMC6431826},
}

Background Improper dosing of medications such as insulin can cause hypoglycemic episodes, which may lead to severe morbidity or even death. Although secure messaging was designed for exchanging nonurgent messages, patients sometimes report hypoglycemia events through secure messaging. Detecting these patient-reported adverse events may help alert clinical teams and enable early corrective actions to improve patient safety. Objective We aimed to develop a natural language processing system, called HypoDetect (Hypoglycemia Detector), to automatically identify hypoglycemia incidents reported in patients’ secure messages. Methods An expert in public health annotated 3000 secure message threads between patients with diabetes and US Department of Veterans Affairs clinical teams as containing patient-reported hypoglycemia incidents or not. A physician independently annotated 100 threads randomly selected from this dataset to determine interannotator agreement. We used this dataset to develop and evaluate HypoDetect. HypoDetect incorporates 3 machine learning algorithms widely used for text classification: linear support vector machines, random forest, and logistic regression. We explored different learning features, including new knowledge-driven features. Because only 114 (3.80%) messages were annotated as positive, we investigated cost-sensitive learning and oversampling methods to mitigate the challenge of imbalanced data. Results The interannotator agreement was Cohen kappa=.976. Using cross-validation, logistic regression with cost-sensitive learning achieved the best performance (area under the receiver operating characteristic curve=0.954, sensitivity=0.693, specificity 0.974, F1 score=0.590). Cost-sensitive learning and the ensembled synthetic minority oversampling technique improved the sensitivity of the baseline systems substantially (by 0.123 to 0.728 absolute gains). 
Our results show that a variety of features contributed to the best performance of HypoDetect. Conclusions Despite the challenge of data imbalance, HypoDetect achieved promising results for the task of detecting hypoglycemia incidents from secure messages. The system has a great potential to facilitate early detection and treatment of hypoglycemia.

Background: Hypoglycemic events are common and potentially dangerous conditions among patients being treated for diabetes. Automatic detection of such events could improve patient care and is valuable in population studies. Electronic health records (EHRs) are valuable resources for the detection of such events. Objective: In this study, we aim to develop a deep-learning–based natural language processing (NLP) system to automatically detect hypoglycemic events from EHR notes. Our model is called the High-Performing System for Automatically Detecting Hypoglycemic Events (HYPE). Methods: Domain experts reviewed 500 EHR notes of diabetes patients to determine whether each sentence contained a hypoglycemic event or not. We used this annotated corpus to train and evaluate HYPE, the high-performance NLP system for hypoglycemia detection. We built and evaluated both a classical machine learning model (ie, support vector machines [SVMs]) and state-of-the-art neural network models. Results: We found that neural network models outperformed the SVM model. The convolutional neural network (CNN) model yielded the highest performance in a 10-fold cross-validation setting: mean precision=0.96 (SD 0.03), mean recall=0.86 (SD 0.03), and mean F1=0.91 (SD 0.03). Conclusions: Despite the challenges posed by small and highly imbalanced data, our CNN-based HYPE system still achieved a high performance for hypoglycemia detection. HYPE can be used for EHR-based hypoglycemia surveillance and population studies in diabetes patients. [JMIR Med Inform 2019;7(4):e14340]

@article{jagannatha_overview_2019,
  title = {Overview of the {First} {Natural} {Language} {Processing} {Challenge} for {Extracting} {Medication}, {Indication}, and {Adverse} {Drug} {Events} from {Electronic} {Health} {Record} {Notes} ({MADE} 1.0)},
  issn = {1179-1942},
  doi = {10.1007/s40264-018-0762-z},
  abstract = {INTRODUCTION: This work describes the Medication and Adverse Drug Events from Electronic Health Records (MADE 1.0) corpus and provides an overview of the MADE 1.0 2018 challenge for extracting medication, indication, and adverse drug events (ADEs) from electronic health record (EHR) notes.
OBJECTIVE: The goal of MADE is to provide a set of common evaluation tasks to assess the state of the art for natural language processing (NLP) systems applied to EHRs supporting drug safety surveillance and pharmacovigilance. We also provide benchmarks on the MADE dataset using the system submissions received in the MADE 2018 challenge.
METHODS: The MADE 1.0 challenge has released an expert-annotated cohort of medication and ADE information comprising 1089 fully de-identified longitudinal EHR notes from 21 randomly selected patients with cancer at the University of Massachusetts Memorial Hospital. Using this cohort as a benchmark, the MADE 1.0 challenge designed three shared NLP tasks. The named entity recognition (NER) task identifies medications and their attributes (dosage, route, duration, and frequency), indications, ADEs, and severity. The relation identification (RI) task identifies relations between the named entities: medication-indication, medication-ADE, and attribute relations. The third shared task (NER-RI) evaluates NLP models that perform the NER and RI tasks jointly. In total, 11 teams from four countries participated in at least one of the three shared tasks, and 41 system submissions were received in total.
RESULTS: The best systems F1 scores for NER, RI, and NER-RI were 0.82, 0.86, and 0.61, respectively. Ensemble classifiers using the team submissions improved the performance further, with an F1 score of 0.85, 0.87, and 0.66 for the three tasks, respectively.
CONCLUSION: MADE results show that recent progress in NLP has led to remarkable improvements in NER and RI tasks for the clinical domain. However, some room for improvement remains, particularly in the NER-RI task.},
  language = {eng},
  number = {1},
  journal = {Drug Safety},
  author = {Jagannatha, Abhyuday and Liu, Feifan and Liu, Weisong and Yu, Hong},
  month = jan,
  year = {2019},
  pmid = {30649735},
  pmcid = {PMC6860017},
  pages = {99--111},
}

INTRODUCTION: This work describes the Medication and Adverse Drug Events from Electronic Health Records (MADE 1.0) corpus and provides an overview of the MADE 1.0 2018 challenge for extracting medication, indication, and adverse drug events (ADEs) from electronic health record (EHR) notes. OBJECTIVE: The goal of MADE is to provide a set of common evaluation tasks to assess the state of the art for natural language processing (NLP) systems applied to EHRs supporting drug safety surveillance and pharmacovigilance. We also provide benchmarks on the MADE dataset using the system submissions received in the MADE 2018 challenge. METHODS: The MADE 1.0 challenge has released an expert-annotated cohort of medication and ADE information comprising 1089 fully de-identified longitudinal EHR notes from 21 randomly selected patients with cancer at the University of Massachusetts Memorial Hospital. Using this cohort as a benchmark, the MADE 1.0 challenge designed three shared NLP tasks. The named entity recognition (NER) task identifies medications and their attributes (dosage, route, duration, and frequency), indications, ADEs, and severity. The relation identification (RI) task identifies relations between the named entities: medication-indication, medication-ADE, and attribute relations. The third shared task (NER-RI) evaluates NLP models that perform the NER and RI tasks jointly. In total, 11 teams from four countries participated in at least one of the three shared tasks, and 41 system submissions were received in total. RESULTS: The best systems F1 scores for NER, RI, and NER-RI were 0.82, 0.86, and 0.61, respectively. Ensemble classifiers using the team submissions improved the performance further, with an F1 score of 0.85, 0.87, and 0.66 for the three tasks, respectively. CONCLUSION: MADE results show that recent progress in NLP has led to remarkable improvements in NER and RI tasks for the clinical domain. 
However, some room for improvement remains, particularly in the NER-RI task.

@inproceedings{rawat_naranjo_2019,
  title = {Naranjo {Question} {Answering} using {End}-to-{End} {Multi}-task {Learning} {Model}},
  doi = {10.1145/3292500.3330770},
  abstract = {In the clinical domain, it is important to understand whether an adverse drug reaction (ADR) is caused by a particular medication. Clinical judgement studies help judge the causal relation between a medication and its ADRs. In this study, we present the first attempt to automatically infer the causality between a drug and an ADR from electronic health records (EHRs) by answering the Naranjo questionnaire, the validated clinical question answering set used by domain experts for ADR causality assessment. Using physicians’ annotation as the gold standard, our proposed joint model, which uses multi-task learning to predict the answers of a subset of the Naranjo questionnaire, significantly outperforms the baseline pipeline model with a good margin, achieving a macro-weighted f-score between 0.3652 – 0.5271 and micro-weighted f-score between 0.9523 – 0.9918.},
  booktitle = {25th {ACM} {SIGKDD} {Conference} on {Knowledge} {Discovery} and {Data} {Mining} ({KDD})},
  author = {Rawat, Bhanu P. and Li, Fei and Yu, Hong},
  year = {2019},
  pmid = {31799022},
  pmcid = {PMC6887102},
  note = {NIHMSID: NIHMS1058295},
  pages = {2547--2555},
}

In the clinical domain, it is important to understand whether an adverse drug reaction (ADR) is caused by a particular medication. Clinical judgement studies help judge the causal relation between a medication and its ADRs. In this study, we present the first attempt to automatically infer the causality between a drug and an ADR from electronic health records (EHRs) by answering the Naranjo questionnaire, the validated clinical question answering set used by domain experts for ADR causality assessment. Using physicians’ annotation as the gold standard, our proposed joint model, which uses multi-task learning to predict the answers of a subset of the Naranjo questionnaire, significantly outperforms the baseline pipeline model with a good margin, achieving a macro-weighted f-score between 0.3652 – 0.5271 and micro-weighted f-score between 0.9523 – 0.9918.

@article{rose_anticoagulant_2019,
  title = {Anticoagulant prescribing for non-valvular atrial fibrillation in the {Veterans} {Health} {Administration}},
  doi = {10.1161/JAHA.119.012646},
  abstract = {Background Direct acting oral anticoagulants (DOACs) theoretically could contribute to addressing underuse of anticoagulation in non-valvular atrial fibrillation (NVAF). Few studies have examined this prospect, however. The potential of DOACs to address underuse of anticoagulation in NVAF could be magnified within a healthcare system that sharply limits patients' exposure to out-of-pocket copayments, such as the Veterans Health Administration (VA). Methods and Results We used a clinical data set of all patients with NVAF treated within VA from 2007 to 2016 (n=987 373). We examined how the proportion of patients receiving any anticoagulation, and which agent was prescribed, changed over time. When first approved for VA use in 2011, DOACs constituted a tiny proportion of all prescriptions for anticoagulants (2\%); by 2016, this proportion had increased to 45\% of all prescriptions and 67\% of new prescriptions. Patient characteristics associated with receiving a DOAC, rather than warfarin, included white race, better kidney function, fewer comorbid conditions overall, and no history of stroke or bleeding. In 2007, before the introduction of DOACs, 56\% of VA patients with NVAF were receiving anticoagulation; this dipped to 44\% in 2012 just after the introduction of DOACs and had risen back to 51\% by 2016. Conclusions These results do not suggest that the availability of DOACs has led to an increased proportion of patients with NVAF receiving anticoagulation, even in the context of a healthcare system that sharply limits patients' exposure to out-of-pocket copayments.},
  journal = {Journal of the American Heart Association},
  author = {Rose, Adam J. and Goldberg, R. and McManus, David D. and Kapoor, Alok and Wang, V. and Liu, Weisong and Yu, Hong},
  year = {2019},
  pmid = {31441364},
  pmcid = {PMC6755851},
}

Background Direct acting oral anticoagulants (DOACs) theoretically could contribute to addressing underuse of anticoagulation in non-valvular atrial fibrillation (NVAF). Few studies have examined this prospect, however. The potential of DOACs to address underuse of anticoagulation in NVAF could be magnified within a healthcare system that sharply limits patients' exposure to out-of-pocket copayments, such as the Veterans Health Administration (VA). Methods and Results We used a clinical data set of all patients with NVAF treated within VA from 2007 to 2016 (n=987 373). We examined how the proportion of patients receiving any anticoagulation, and which agent was prescribed, changed over time. When first approved for VA use in 2011, DOACs constituted a tiny proportion of all prescriptions for anticoagulants (2%); by 2016, this proportion had increased to 45% of all prescriptions and 67% of new prescriptions. Patient characteristics associated with receiving a DOAC, rather than warfarin, included white race, better kidney function, fewer comorbid conditions overall, and no history of stroke or bleeding. In 2007, before the introduction of DOACs, 56% of VA patients with NVAF were receiving anticoagulation; this dipped to 44% in 2012 just after the introduction of DOACs and had risen back to 51% by 2016. Conclusions These results do not suggest that the availability of DOACs has led to an increased proportion of patients with NVAF receiving anticoagulation, even in the context of a healthcare system that sharply limits patients' exposure to out-of-pocket copayments.

Incorporating Item Response Theory (IRT) into NLP tasks can provide valuable information about model performance and behavior. Traditionally, IRT models are learned using human response pattern (RP) data, presenting a significant bottleneck for large data sets like those required for training deep neural networks (DNNs). In this work we propose learning IRT models using RPs generated from artificial crowds of DNN models. We demonstrate the effectiveness of learning IRT models using DNN-generated data through quantitative and qualitative analyses for two NLP tasks. Parameters learned from human and machine RPs for natural language inference and sentiment analysis exhibit medium to large positive correlations. We demonstrate a use-case for latent difficulty item parameters, namely training set filtering, and show that using difficulty to sample training data outperforms baseline methods. Finally, we highlight cases where human expectation about item difficulty does not match difficulty as estimated from the machine RPs.

@article{zheng_quiklite_2019,
  title = {{QuikLitE}, a {Framework} for {Quick} {Literacy} {Evaluation} in {Medicine}: {Development} and {Validation}},
  shorttitle = {{QuikLitE}, a {Framework} for {Quick} {Literacy} {Evaluation} in {Medicine}},
  volume = {21},
  number = {2},
  url = {https://www.jmir.org/2019/2/e12525/},
  doi = {10.2196/jmir.12525},
  abstract = {Background: A plethora of health literacy instruments was developed over the decades. They usually start with experts curating passages of text or word lists, followed by psychometric validation and revision based on test results obtained from a sample population. This process is costly and it is difficult to customize for new usage scenarios. Objective: This study aimed to develop and evaluate a framework for dynamically creating test instruments that can provide a focused assessment of patients’ health literacy. Methods: A health literacy framework and scoring method were extended from the vocabulary knowledge test to accommodate a wide range of item difficulties and various degrees of uncertainty in the participant’s answer. Web-based tests from Amazon Mechanical Turk users were used to assess reliability and validity. Results: Parallel forms of our tests showed high reliability (correlation=.78; 95\% CI 0.69-0.85). Validity measured as correlation with an electronic health record comprehension instrument was higher (.47-.61 among 3 groups) than 2 existing tools (Short Assessment of Health Literacy-English, .38-.43; Short Test of Functional Health Literacy in Adults, .34-.46). Our framework is able to distinguish higher literacy levels that are often not measured by other instruments. It is also flexible, allowing customizations to the test the designer’s focus on a particular interest in a subject matter or domain. The framework is among the fastest health literacy instrument to administer. Conclusions: We proposed a valid and highly reliable framework to dynamically create health literacy instruments, alleviating the need to repeat a time-consuming process when a new use scenario arises. This framework can be customized to a specific need on demand and can measure skills beyond the basic level.},
  language = {en},
  urldate = {2019-02-22},
  journal = {Journal of Medical Internet Research},
  author = {Zheng, Jiaping and Yu, Hong},
  year = {2019},
  pmid = {30794206},
  pmcid = {PMC6406229},
  pages = {e12525},
}

Background: A plethora of health literacy instruments was developed over the decades. They usually start with experts curating passages of text or word lists, followed by psychometric validation and revision based on test results obtained from a sample population. This process is costly and it is difficult to customize for new usage scenarios. Objective: This study aimed to develop and evaluate a framework for dynamically creating test instruments that can provide a focused assessment of patients’ health literacy. Methods: A health literacy framework and scoring method were extended from the vocabulary knowledge test to accommodate a wide range of item difficulties and various degrees of uncertainty in the participant’s answer. Web-based tests from Amazon Mechanical Turk users were used to assess reliability and validity. Results: Parallel forms of our tests showed high reliability (correlation=.78; 95% CI 0.69-0.85). Validity measured as correlation with an electronic health record comprehension instrument was higher (.47-.61 among 3 groups) than 2 existing tools (Short Assessment of Health Literacy-English, .38-.43; Short Test of Functional Health Literacy in Adults, .34-.46). Our framework is able to distinguish higher literacy levels that are often not measured by other instruments. It is also flexible, allowing customizations to the test the designer’s focus on a particular interest in a subject matter or domain. The framework is among the fastest health literacy instrument to administer. Conclusions: We proposed a valid and highly reliable framework to dynamically create health literacy instruments, alleviating the need to repeat a time-consuming process when a new use scenario arises. This framework can be customized to a specific need on demand and can measure skills beyond the basic level. [J Med Internet Res 2019;21(2):e12525]

@article{lalor_improving_2019,
title = {Improving {Electronic} {Health} {Record} {Note} {Comprehension} {With} {NoteAid}: {Randomized} {Trial} of {Electronic} {Health} {Record} {Note} {Comprehension} {Interventions} {With} {Crowdsourced} {Workers}},
volume = {21},
copyright = {Unless stated otherwise, all articles are open-access distributed under the terms of the Creative Commons Attribution License (http://creativecommons.org/licenses/by/2.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work (},
shorttitle = {Improving {Electronic} {Health} {Record} {Note} {Comprehension} {With} {NoteAid}},
url = {https://www.jmir.org/2019/1/e10793/},
doi = {10.2196/jmir.10793},
abstract = {Background: Patient portals are becoming more common, and with them, the ability of patients to access their personal electronic health records (EHRs). EHRs, in particular the free-text EHR notes, often contain medical jargon and terms that are difficult for laypersons to understand. There are many Web-based resources for learning more about particular diseases or conditions, including systems that directly link to lay definitions or educational materials for medical concepts. Objective: Our goal is to determine whether use of one such tool, NoteAid, leads to higher EHR note comprehension ability. We use a new EHR note comprehension assessment tool instead of patient self-reported scores. Methods: In this work, we compare a passive, self-service educational resource (MedlinePlus) with an active resource (NoteAid) where definitions are provided to the user for medical concepts that the system identifies. We use Amazon Mechanical Turk (AMT) to recruit individuals to complete ComprehENotes, a new test of EHR note comprehension. Results: Mean scores for individuals with access to NoteAid are significantly higher than the mean baseline scores, both for raw scores (P=.008) and estimated ability (P=.02). Conclusions: In our experiments, we show that the active intervention leads to significantly higher scores on the comprehension test as compared with a baseline group with no resources provided. In contrast, there is no significant difference between the group that was provided with the passive intervention and the baseline group. Finally, we analyze the demographics of the individuals who participated in our AMT task and show differences between groups that align with the current understanding of health literacy between populations. This is the first work to show improvements in comprehension using tools such as NoteAid as measured by an EHR note comprehension assessment tool as opposed to patient self-reported scores. [J Med Internet Res 2019;21(1):e10793]},
language = {en},
number = {1},
urldate = {2019-01-31},
journal = {Journal of Medical Internet Research},
author = {Lalor, John P. and Woolf, Beverly and Yu, Hong},
year = {2019},
pmid = {30664453},
pmcid = {PMC6351990},
pages = {e10793}
}

Background: Patient portals are becoming more common, and with them, the ability of patients to access their personal electronic health records (EHRs). EHRs, in particular the free-text EHR notes, often contain medical jargon and terms that are difficult for laypersons to understand. There are many Web-based resources for learning more about particular diseases or conditions, including systems that directly link to lay definitions or educational materials for medical concepts. Objective: Our goal is to determine whether use of one such tool, NoteAid, leads to higher EHR note comprehension ability. We use a new EHR note comprehension assessment tool instead of patient self-reported scores. Methods: In this work, we compare a passive, self-service educational resource (MedlinePlus) with an active resource (NoteAid) where definitions are provided to the user for medical concepts that the system identifies. We use Amazon Mechanical Turk (AMT) to recruit individuals to complete ComprehENotes, a new test of EHR note comprehension. Results: Mean scores for individuals with access to NoteAid are significantly higher than the mean baseline scores, both for raw scores (P=.008) and estimated ability (P=.02). Conclusions: In our experiments, we show that the active intervention leads to significantly higher scores on the comprehension test as compared with a baseline group with no resources provided. In contrast, there is no significant difference between the group that was provided with the passive intervention and the baseline group. Finally, we analyze the demographics of the individuals who participated in our AMT task and show differences between groups that align with the current understanding of health literacy between populations. This is the first work to show improvements in comprehension using tools such as NoteAid as measured by an EHR note comprehension assessment tool as opposed to patient self-reported scores. [J Med Internet Res 2019;21(1):e10793]

@inproceedings{yang_generating_2019,
address = {Hong Kong, China},
title = {Generating {Classical} {Chinese} {Poems} from {Vernacular} {Chinese}},
url = {https://www.aclweb.org/anthology/D19-1637},
doi = {10.18653/v1/D19-1637},
abstract = {Classical Chinese poetry is a jewel in the treasure house of Chinese culture. Previous poem generation models only allow users to employ keywords to interfere the meaning of generated poems, leaving the dominion of generation to the model. In this paper, we propose a novel task of generating classical Chinese poems from vernacular, which allows users to have more control over the semantic of generated poems. We adapt the approach of unsupervised machine translation (UMT) to our task. We use segmentation-based padding and reinforcement learning to address under-translation and over-translation respectively. According to experiments, our approach significantly improve the perplexity and BLEU compared with typical UMT models. Furthermore, we explored guidelines on how to write the input vernacular to generate better poems. Human evaluation showed our approach can generate high-quality poems which are comparable to amateur poems.},
urldate = {2019-11-11},
booktitle = {Proceedings of the 2019 {Conference} on {Empirical} {Methods} in {Natural} {Language} {Processing} and the 9th {International} {Joint} {Conference} on {Natural} {Language} {Processing} ({EMNLP}-{IJCNLP})},
publisher = {Association for Computational Linguistics},
author = {Yang, Zhichao and Cai, Pengshan and Feng, Yansong and Li, Fei and Feng, Weijiang and Chiu, Elena Suet-Ying and Yu, Hong},
month = nov,
year = {2019},
pages = {6156--6165}
}

Classical Chinese poetry is a jewel in the treasure house of Chinese culture. Previous poem generation models only allow users to employ keywords to interfere with the meaning of generated poems, leaving the dominion of generation to the model. In this paper, we propose a novel task of generating classical Chinese poems from vernacular, which allows users to have more control over the semantics of generated poems. We adapt the approach of unsupervised machine translation (UMT) to our task. We use segmentation-based padding and reinforcement learning to address under-translation and over-translation respectively. According to experiments, our approach significantly improves the perplexity and BLEU compared with typical UMT models. Furthermore, we explored guidelines on how to write the input vernacular to generate better poems. Human evaluation showed our approach can generate high-quality poems which are comparable to amateur poems.

@patent{yu_method_2019,
  author      = {Yu, Hong and Munkhdalai, Tsendsuren},
  title       = {Method for {Meta}-{Level} {Continual} {Learning}},
  number      = {US20190034798A1},
  nationality = {US},
  assignee    = {University Of Massachusetts Medical School},
  month       = jan,
  year        = {2019},
  url         = {https://patents.google.com/patent/US20190034798A1/en},
  urldate     = {2019-04-10},
  abstract    = {Classification of an input task data set by meta level continual learning includes analyzing first and second training data sets in a task space to generate first and second meta weights and a slow weight value, and comparing an input task data set to the slow weight to generate a fast weight. The first and second meta weights are parameterized with the fast weight value to update the slow weight value, whereby a value is associated with the input task data set, thereby classifying the input task data set by meta level continual learning.},
  keywords    = {loss, meta, slow, task, weight}
}

Classification of an input task data set by meta level continual learning includes analyzing first and second training data sets in a task space to generate first and second meta weights and a slow weight value, and comparing an input task data set to the slow weight to generate a fast weight. The first and second meta weights are parameterized with the fast weight value to update the slow weight value, whereby a value is associated with the input task data set, thereby classifying the input task data set by meta level continual learning.

Electronic health records (EHR) capture “real-world” disease and care processes and hence offer richer and more generalizable data for comparative effectiveness research than traditional randomized clinical trial studies. With the increasingly broadening adoption of EHR worldwide, there is a growing need to widen the use of EHR data to support clinical research. A big barrier to this goal is that much of the information in EHR is still narrative. This chapter describes the foundation of biomedical language processing and explains how traditional machine learning and the state-of-the-art deep learning techniques can be employed in the context of extracting and transforming narrative information in EHR to support clinical research.

@article{pradhan_automatic_2019,
  author   = {Pradhan, Richeek and Hoaglin, David C. and Cornell, Matthew and Liu, Weisong and Wang, Victoria and Yu, Hong},
  title    = {Automatic extraction of quantitative data from {ClinicalTrials}.gov to conduct meta-analyses},
  journal  = {Journal of Clinical Epidemiology},
  volume   = {105},
  pages    = {92--100},
  month    = jan,
  year     = {2019},
  issn     = {1878-5921},
  doi      = {10.1016/j.jclinepi.2018.08.023},
  pmid     = {30257185},
  language = {eng},
  abstract = {OBJECTIVES: Systematic reviews and meta-analyses are labor-intensive and time-consuming. Automated extraction of quantitative data from primary studies can accelerate this process. ClinicalTrials.gov, launched in 2000, is the world's largest trial repository of results data from clinical trials; it has been used as a source instead of journal articles. We have developed a Web application called EXACT (EXtracting Accurate efficacy and safety information from ClinicalTrials.gov) that allows users without advanced programming skills to automatically extract data from ClinicalTrials.gov in analysis-ready format. We have also used the automatically extracted data to examine the reproducibility of meta-analyses in three published systematic reviews.
STUDY DESIGN AND SETTING: We developed a Python-based software application (EXACT) that automatically extracts data required for meta-analysis from the ClinicalTrials.gov database in a spreadsheet format. We confirmed the accuracy of the extracted data and then used those data to repeat meta-analyses in three published systematic reviews. To ensure that we used the same statistical methods and outcomes as the published systematic reviews, we repeated the meta-analyses using data manually extracted from the relevant journal articles. For the outcomes whose results we were able to reproduce using those journal article data, we examined the usability of ClinicalTrials.gov data.
RESULTS: EXACT extracted data at ClincalTrials.gov with 100\% accuracy, and it required 60\% less time than the usual practice of manually extracting data from journal articles. We found that 87\% of the data elements extracted using EXACT matched those extracted manually from the journal articles. We were able to reproduce 24 of 28 outcomes using the journal article data. Of these 24 outcomes, we were able to reproduce 83.3\% of the published estimates using data at ClinicalTrials.gov.
CONCLUSION: EXACT (http://bio-nlp.org/EXACT) automatically and accurately extracted data elements from ClinicalTrials.gov and thus reduced time in data extraction. The ClinicalTrials.gov data reproduced most meta-analysis results in our study, but this conclusion needs further validation.},
  keywords = {Automatic data extraction, ClinicalTrials.gov, Meta-analysis, Reproducibility, Simeprevir, Systematic review, Trametinib, Vortioxetine}
}

OBJECTIVES: Systematic reviews and meta-analyses are labor-intensive and time-consuming. Automated extraction of quantitative data from primary studies can accelerate this process. ClinicalTrials.gov, launched in 2000, is the world's largest trial repository of results data from clinical trials; it has been used as a source instead of journal articles. We have developed a Web application called EXACT (EXtracting Accurate efficacy and safety information from ClinicalTrials.gov) that allows users without advanced programming skills to automatically extract data from ClinicalTrials.gov in analysis-ready format. We have also used the automatically extracted data to examine the reproducibility of meta-analyses in three published systematic reviews. STUDY DESIGN AND SETTING: We developed a Python-based software application (EXACT) that automatically extracts data required for meta-analysis from the ClinicalTrials.gov database in a spreadsheet format. We confirmed the accuracy of the extracted data and then used those data to repeat meta-analyses in three published systematic reviews. To ensure that we used the same statistical methods and outcomes as the published systematic reviews, we repeated the meta-analyses using data manually extracted from the relevant journal articles. For the outcomes whose results we were able to reproduce using those journal article data, we examined the usability of ClinicalTrials.gov data. RESULTS: EXACT extracted data at ClinicalTrials.gov with 100% accuracy, and it required 60% less time than the usual practice of manually extracting data from journal articles. We found that 87% of the data elements extracted using EXACT matched those extracted manually from the journal articles. We were able to reproduce 24 of 28 outcomes using the journal article data. Of these 24 outcomes, we were able to reproduce 83.3% of the published estimates using data at ClinicalTrials.gov.
CONCLUSION: EXACT (http://bio-nlp.org/EXACT) automatically and accurately extracted data elements from ClinicalTrials.gov and thus reduced time in data extraction. The ClinicalTrials.gov data reproduced most meta-analysis results in our study, but this conclusion needs further validation.

@article{munkhdalai_clinical_2018,
title = {Clinical {Relation} {Extraction} {Toward} {Drug} {Safety} {Surveillance} {Using} {Electronic} {Health} {Record} {Narratives}: {Classical} {Learning} {Versus} {Deep} {Learning}},
volume = {4},
issn = {2369-2960},
shorttitle = {Clinical {Relation} {Extraction} {Toward} {Drug} {Safety} {Surveillance} {Using} {Electronic} {Health} {Record} {Narratives}},
doi = {10.2196/publichealth.9361},
abstract = {BACKGROUND: Medication and adverse drug event (ADE) information extracted from electronic health record (EHR) notes can be a rich resource for drug safety surveillance. Existing observational studies have mainly relied on structured EHR data to obtain ADE information; however, ADEs are often buried in the EHR narratives and not recorded in structured data.
OBJECTIVE: To unlock ADE-related information from EHR narratives, there is a need to extract relevant entities and identify relations among them. In this study, we focus on relation identification. This study aimed to evaluate natural language processing and machine learning approaches using the expert-annotated medical entities and relations in the context of drug safety surveillance, and investigate how different learning approaches perform under different configurations.
METHODS: We have manually annotated 791 EHR notes with 9 named entities (eg, medication, indication, severity, and ADEs) and 7 different types of relations (eg, medication-dosage, medication-ADE, and severity-ADE). Then, we explored 3 supervised machine learning systems for relation identification: (1) a support vector machines (SVM) system, (2) an end-to-end deep neural network system, and (3) a supervised descriptive rule induction baseline system. For the neural network system, we exploited the state-of-the-art recurrent neural network (RNN) and attention models. We report the performance by macro-averaged precision, recall, and F1-score across the relation types.
RESULTS: Our results show that the SVM model achieved the best average F1-score of 89.1\% on test data, outperforming the long short-term memory (LSTM) model with attention (F1-score of 65.72\%) as well as the rule induction baseline system (F1-score of 7.47\%) by a large margin. The bidirectional LSTM model with attention achieved the best performance among different RNN models. With the inclusion of additional features in the LSTM model, its performance can be boosted to an average F1-score of 77.35\%.
CONCLUSIONS: It shows that classical learning models (SVM) remains advantageous over deep learning models (RNN variants) for clinical relation identification, especially for long-distance intersentential relations. However, RNNs demonstrate a great potential of significant improvement if more training data become available. Our work is an important step toward mining EHRs to improve the efficacy of drug safety surveillance. Most importantly, the annotated data used in this study will be made publicly available, which will further promote drug safety research in the community.},
language = {eng},
number = {2},
journal = {JMIR public health and surveillance},
author = {Munkhdalai, Tsendsuren and Liu, Feifan and Yu, Hong},
month = apr,
year = {2018},
pmid = {29695376},
pmcid = {PMC5943628},
keywords = {drug-related side effects and adverse reactions, electronic health records, medical informatics applications, natural language processing, neural networks},
pages = {e29}
}

BACKGROUND: Medication and adverse drug event (ADE) information extracted from electronic health record (EHR) notes can be a rich resource for drug safety surveillance. Existing observational studies have mainly relied on structured EHR data to obtain ADE information; however, ADEs are often buried in the EHR narratives and not recorded in structured data. OBJECTIVE: To unlock ADE-related information from EHR narratives, there is a need to extract relevant entities and identify relations among them. In this study, we focus on relation identification. This study aimed to evaluate natural language processing and machine learning approaches using the expert-annotated medical entities and relations in the context of drug safety surveillance, and investigate how different learning approaches perform under different configurations. METHODS: We have manually annotated 791 EHR notes with 9 named entities (eg, medication, indication, severity, and ADEs) and 7 different types of relations (eg, medication-dosage, medication-ADE, and severity-ADE). Then, we explored 3 supervised machine learning systems for relation identification: (1) a support vector machines (SVM) system, (2) an end-to-end deep neural network system, and (3) a supervised descriptive rule induction baseline system. For the neural network system, we exploited the state-of-the-art recurrent neural network (RNN) and attention models. We report the performance by macro-averaged precision, recall, and F1-score across the relation types. RESULTS: Our results show that the SVM model achieved the best average F1-score of 89.1% on test data, outperforming the long short-term memory (LSTM) model with attention (F1-score of 65.72%) as well as the rule induction baseline system (F1-score of 7.47%) by a large margin. The bidirectional LSTM model with attention achieved the best performance among different RNN models. 
With the inclusion of additional features in the LSTM model, its performance can be boosted to an average F1-score of 77.35%. CONCLUSIONS: It shows that classical learning models (SVM) remains advantageous over deep learning models (RNN variants) for clinical relation identification, especially for long-distance intersentential relations. However, RNNs demonstrate a great potential of significant improvement if more training data become available. Our work is an important step toward mining EHRs to improve the efficacy of drug safety surveillance. Most importantly, the annotated data used in this study will be made publicly available, which will further promote drug safety research in the community.

BACKGROUND: Many health care systems now allow patients to access their electronic health record (EHR) notes online through patient portals. Medical jargon in EHR notes can confuse patients, which may interfere with potential benefits of patient access to EHR notes. OBJECTIVE: The aim of this study was to develop and evaluate the usability and content quality of NoteAid, a Web-based natural language processing system that links medical terms in EHR notes to lay definitions, that is, definitions easily understood by lay people. METHODS: NoteAid incorporates two core components: CoDeMed, a lexical resource of lay definitions for medical terms, and MedLink, a computational unit that links medical terms to lay definitions. We developed innovative computational methods, including an adapted distant supervision algorithm to prioritize medical terms important for EHR comprehension to facilitate the effort of building CoDeMed. Ten physician domain experts evaluated the user interface and content quality of NoteAid. The evaluation protocol included a cognitive walkthrough session and a postsession questionnaire. Physician feedback sessions were audio-recorded. We used standard content analysis methods to analyze qualitative data from these sessions. RESULTS: Physician feedback was mixed. Positive feedback on NoteAid included (1) Easy to use, (2) Good visual display, (3) Satisfactory system speed, and (4) Adequate lay definitions. Opportunities for improvement arising from evaluation sessions and feedback included (1) improving the display of definitions for partially matched terms, (2) including more medical terms in CoDeMed, (3) improving the handling of terms whose definitions vary depending on different contexts, and (4) standardizing the scope of definitions for medicines. On the basis of these results, we have improved NoteAid's user interface and a number of definitions, and added 4502 more definitions in CoDeMed. 
CONCLUSIONS: Physician evaluation yielded useful feedback for content validation and refinement of this innovative tool that has the potential to improve patient EHR comprehension and experience using patient portals. Future ongoing work will develop algorithms to handle ambiguous medical terms and test and evaluate NoteAid with patients.

In this paper, we propose a novel neural network architecture for clinical text mining. We formulate this hybrid neural network model (HNN), composed of recurrent neural network and deep residual network, to jointly predict the presence and period assertion values associated with medical events in clinical texts. We evaluate the effectiveness of our model on a corpus of expert-annotated longitudinal Electronic Health Records (EHR) notes from Cancer patients. Our experiments show that HNN improves the joint assertion classification accuracy as compared to conventional baselines.

@article{zheng_assessing_2018,
  author     = {Zheng, Jiaping and Yu, Hong},
  title      = {Assessing the {Readability} of {Medical} {Documents}: {A} {Ranking} {Approach}},
  shorttitle = {Assessing the {Readability} of {Medical} {Documents}},
  journal    = {JMIR medical informatics},
  volume     = {6},
  number     = {1},
  pages      = {e17},
  month      = mar,
  year       = {2018},
  issn       = {2291-9694},
  doi        = {10.2196/medinform.8611},
  pmid       = {29572199},
  pmcid      = {PMC5889493},
  language   = {eng},
  abstract   = {BACKGROUND: The use of electronic health record (EHR) systems with patient engagement capabilities, including viewing, downloading, and transmitting health information, has recently grown tremendously. However, using these resources to engage patients in managing their own health remains challenging due to the complex and technical nature of the EHR narratives.
OBJECTIVE: Our objective was to develop a machine learning-based system to assess readability levels of complex documents such as EHR notes.
METHODS: We collected difficulty ratings of EHR notes and Wikipedia articles using crowdsourcing from 90 readers. We built a supervised model to assess readability based on relative orders of text difficulty using both surface text features and word embeddings. We evaluated system performance using the Kendall coefficient of concordance against human ratings.
RESULTS: Our system achieved significantly higher concordance (.734) with human annotators than did a baseline using the Flesch-Kincaid Grade Level, a widely adopted readability formula (.531). The improvement was also consistent across different disease topics. This method's concordance with an individual human user's ratings was also higher than the concordance between different human annotators (.658).
CONCLUSIONS: We explored methods to automatically assess the readability levels of clinical narratives. Our ranking-based system using simple textual features and easy-to-learn word embeddings outperformed a widely used readability formula. Our ranking-based method can predict relative difficulties of medical documents. It is not constrained to a predefined set of readability levels, a common design in many machine learning-based systems. Furthermore, the feature set does not rely on complex processing of the documents. One potential application of our readability ranking is personalization, allowing patients to better accommodate their own background knowledge.},
  keywords   = {comprehension, electronic health records, machine learning, readability}
}

BACKGROUND: The use of electronic health record (EHR) systems with patient engagement capabilities, including viewing, downloading, and transmitting health information, has recently grown tremendously. However, using these resources to engage patients in managing their own health remains challenging due to the complex and technical nature of the EHR narratives. OBJECTIVE: Our objective was to develop a machine learning-based system to assess readability levels of complex documents such as EHR notes. METHODS: We collected difficulty ratings of EHR notes and Wikipedia articles using crowdsourcing from 90 readers. We built a supervised model to assess readability based on relative orders of text difficulty using both surface text features and word embeddings. We evaluated system performance using the Kendall coefficient of concordance against human ratings. RESULTS: Our system achieved significantly higher concordance (.734) with human annotators than did a baseline using the Flesch-Kincaid Grade Level, a widely adopted readability formula (.531). The improvement was also consistent across different disease topics. This method's concordance with an individual human user's ratings was also higher than the concordance between different human annotators (.658). CONCLUSIONS: We explored methods to automatically assess the readability levels of clinical narratives. Our ranking-based system using simple textual features and easy-to-learn word embeddings outperformed a widely used readability formula. Our ranking-based method can predict relative difficulties of medical documents. It is not constrained to a predefined set of readability levels, a common design in many machine learning-based systems. Furthermore, the feature set does not rely on complex processing of the documents. One potential application of our readability ranking is personalization, allowing patients to better accommodate their own background knowledge.

@inproceedings{lalor_understanding_2018,
title = {Understanding {Deep} {Learning} {Performance} through an {Examination} of {Test} {Set} {Difficulty}: {A} {Psychometric} {Case} {Study}},
url = {https://arxiv.org/abs/1702.04811v3},
doi = {10.18653/v1/D18-1500},
abstract = {Interpreting the performance of deep learning models beyond test set accuracy is challenging. Characteristics of individual data points are often not considered during evaluation, and each data point is treated equally. We examine the impact of a test set question's difficulty to determine if there is a relationship between difficulty and performance. We model difficulty using well-studied psychometric methods on human response patterns. Experiments on Natural Language Inference (NLI) and Sentiment Analysis (SA) show that the likelihood of answering a question correctly is impacted by the question's difficulty. As DNNs are trained with more data, easy examples are learned more quickly than hard examples.},
booktitle = {Proceedings of the 2018 {Conference} on {Empirical} {Methods} in {Natural} {Language} {Processing} ({EMNLP})},
author = {Lalor, John P. and Wu, Hao and Munkhdalai, Tsendsuren and Yu, Hong},
year = {2018}
}

Interpreting the performance of deep learning models beyond test set accuracy is challenging. Characteristics of individual data points are often not considered during evaluation, and each data point is treated equally. We examine the impact of a test set question's difficulty to determine if there is a relationship between difficulty and performance. We model difficulty using well-studied psychometric methods on human response patterns. Experiments on Natural Language Inference (NLI) and Sentiment Analysis (SA) show that the likelihood of answering a question correctly is impacted by the question's difficulty. As DNNs are trained with more data, easy examples are learned more quickly than hard examples.

@inproceedings{lalor_soft_2018,
  author   = {Lalor, John and Wu, Hao and Yu, Hong},
  title    = {Soft {Label} {Memorization}-{Generalization} for {Natural} {Language} {Inference}.},
  year     = {2018},
  url      = {https://arxiv.org/abs/1702.08563v3},
  abstract = {Often when multiple labels are obtained for a training example it is assumed that there is an element of noise that must be accounted for. It has been shown that this disagreement can be considered signal instead of noise. In this work we investigate using soft labels for training data to improve generalization in machine learning models. However, using soft labels for training Deep Neural Networks (DNNs) is not practical due to the costs involved in obtaining multiple labels for large data sets. We propose soft label memorization-generalization (SLMG), a fine-tuning approach to using soft labels for training DNNs. We assume that differences in labels provided by human annotators represent ambiguity about the true label instead of noise. Experiments with SLMG demonstrate improved generalization performance on the Natural Language Inference (NLI) task. Our experiments show that by injecting a small percentage of soft label training data (0.03\% of training set size) we can improve generalization performance over several baselines.}
}

Often when multiple labels are obtained for a training example it is assumed that there is an element of noise that must be accounted for. It has been shown that this disagreement can be considered signal instead of noise. In this work we investigate using soft labels for training data to improve generalization in machine learning models. However, using soft labels for training Deep Neural Networks (DNNs) is not practical due to the costs involved in obtaining multiple labels for large data sets. We propose soft label memorization-generalization (SLMG), a fine-tuning approach to using soft labels for training DNNs. We assume that differences in labels provided by human annotators represent ambiguity about the true label instead of noise. Experiments with SLMG demonstrate improved generalization performance on the Natural Language Inference (NLI) task. Our experiments show that by injecting a small percentage of soft label training data (0.03% of training set size) we can improve generalization performance over several baselines.

@inproceedings{vu_sentence_2018,
  title     = {Sentence {Simplification} with {Memory}-{Augmented} {Neural} {Networks}},
  doi       = {10.18653/v1/N18-2013},
  abstract  = {Sentence simplification aims to simplify the content and structure of complex sentences, and thus make them easier to interpret for human readers, and easier to process for downstream NLP applications. Recent advances in neural machine translation have paved the way for novel approaches to the task. In this paper, we adapt an architecture with augmented memory capacities called Neural Semantic Encoders (Munkhdalai and Yu, 2017) for sentence simplification. Our experiments demonstrate the effectiveness of our approach on different simplification datasets, both in terms of automatic evaluation measures and human judgments.},
  booktitle = {North {American} {Chapter} of the {Association} for {Computational} {Linguistics}: {Human} {Language} {Technologies}},
  author    = {Vu, Tu and Hu, Baotian and Munkhdalai, Tsendsuren and Yu, Hong},
  year      = {2018}
}

Sentence simplification aims to simplify the content and structure of complex sentences, and thus make them easier to interpret for human readers, and easier to process for downstream NLP applications. Recent advances in neural machine translation have paved the way for novel approaches to the task. In this paper, we adapt an architecture with augmented memory capacities called Neural Semantic Encoders (Munkhdalai and Yu, 2017) for sentence simplification. Our experiments demonstrate the effectiveness of our approach on different simplification datasets, both in terms of automatic evaluation measures and human judgments.

BACKGROUND: Atrial fibrillation (AF) is a common complication of acute myocardial infarction (AMI). The CHA2DS2-VASc and CHADS2 risk scores are used to identify patients with AF at risk for stroke and to guide oral anticoagulants (OAC) use, including patients with AMI. However, the epidemiology of AF, further stratified according to patients' risk of stroke, has not been well characterized among those hospitalized for AMI. METHODS: We examined trends in the frequency of AF, rates of discharge OAC use, and post-discharge outcomes among 6,627 residents of the Worcester, Massachusetts area who survived hospitalization for AMI at 11 medical centers between 1997 and 2011. RESULTS: A total of 1,050 AMI patients had AF (16%) and the majority (91%) had a CHA2DS2-VASc score >2. AF rates were highest among patients in the highest stroke risk group. In comparison to patients without AF, patients with AMI and AF in the highest stroke risk category had higher rates of post-discharge complications, including higher 30-day re-hospitalization [27% vs. 17%], 30-day post-discharge death [10% vs. 5%], and 1-year post-discharge death [46% vs. 18%] (p < 0.001 for all). Notably, fewer than half of guideline-eligible AF patients received an OAC prescription at discharge. Usage rates for other evidence-based therapies, such as statins and beta-blockers, lagged in comparison to AMI patients free from AF. CONCLUSIONS: Our findings highlight the need to enhance efforts towards stroke prevention among AMI survivors with AF.

@article{lalor_comprehenotes:_2018,
  title    = {{ComprehENotes}: {An} {Instrument} to {Assess} {Patient} {EHR} {Note} {Reading} {Comprehension} of {Electronic} {Health} {Record} {Notes}: {Development} and {Validation}},
  doi      = {10.2196/jmir.9380},
  abstract = {BACKGROUND:
Patient portals are widely adopted in the United States and allow millions of patients access to their electronic health records (EHRs), including their EHR clinical notes. A patient's ability to understand the information in the EHR is dependent on their overall health literacy. Although many tests of health literacy exist, none specifically focuses on EHR note comprehension.
OBJECTIVE:
The aim of this paper was to develop an instrument to assess patients' EHR note comprehension.
METHODS:
We identified 6 common diseases or conditions (heart failure, diabetes, cancer, hypertension, chronic obstructive pulmonary disease, and liver failure) and selected 5 representative EHR notes for each disease or condition. One note that did not contain natural language text was removed. Questions were generated from these notes using Sentence Verification Technique and were analyzed using item response theory (IRT) to identify a set of questions that represent a good test of ability for EHR note comprehension.
RESULTS:
Using Sentence Verification Technique, 154 questions were generated from the 29 EHR notes initially obtained. Of these, 83 were manually selected for inclusion in the Amazon Mechanical Turk crowdsourcing tasks and 55 were ultimately retained following IRT analysis. A follow-up validation with a second Amazon Mechanical Turk task and IRT analysis confirmed that the 55 questions test a latent ability dimension for EHR note comprehension. A short test of 14 items was created along with the 55-item test.
CONCLUSIONS:
We developed ComprehENotes, an instrument for assessing EHR note comprehension from existing EHR notes, gathered responses using crowdsourcing, and used IRT to analyze those responses, thus resulting in a set of questions to measure EHR note comprehension. Crowdsourced responses from Amazon Mechanical Turk can be used to estimate item parameters and select a subset of items for inclusion in the test set using IRT. The final set of questions is the first test of EHR note comprehension.},
  journal  = {Journal of Medical Internet Research},
  author   = {Lalor, J and Wu, H and Chen, L and Mazor, K and Yu, H},
  month    = apr,
  year     = {2018},
  pmid     = {29695372},
  pmcid    = {PMC5943623}
}

BACKGROUND: Patient portals are widely adopted in the United States and allow millions of patients access to their electronic health records (EHRs), including their EHR clinical notes. A patient's ability to understand the information in the EHR is dependent on their overall health literacy. Although many tests of health literacy exist, none specifically focuses on EHR note comprehension. OBJECTIVE: The aim of this paper was to develop an instrument to assess patients' EHR note comprehension. METHODS: We identified 6 common diseases or conditions (heart failure, diabetes, cancer, hypertension, chronic obstructive pulmonary disease, and liver failure) and selected 5 representative EHR notes for each disease or condition. One note that did not contain natural language text was removed. Questions were generated from these notes using Sentence Verification Technique and were analyzed using item response theory (IRT) to identify a set of questions that represent a good test of ability for EHR note comprehension. RESULTS: Using Sentence Verification Technique, 154 questions were generated from the 29 EHR notes initially obtained. Of these, 83 were manually selected for inclusion in the Amazon Mechanical Turk crowdsourcing tasks and 55 were ultimately retained following IRT analysis. A follow-up validation with a second Amazon Mechanical Turk task and IRT analysis confirmed that the 55 questions test a latent ability dimension for EHR note comprehension. A short test of 14 items was created along with the 55-item test. CONCLUSIONS: We developed ComprehENotes, an instrument for assessing EHR note comprehension from existing EHR notes, gathered responses using crowdsourcing, and used IRT to analyze those responses, thus resulting in a set of questions to measure EHR note comprehension. Crowdsourced responses from Amazon Mechanical Turk can be used to estimate item parameters and select a subset of items for inclusion in the test set using IRT. 
The final set of questions is the first test of EHR note comprehension.

@article{li_extraction_2018,
  title      = {Extraction of {Information} {Related} to {Adverse} {Drug} {Events} from {Electronic} {Health} {Record} {Notes}: {Design} of an {End}-to-{End} {Model} {Based} on {Deep} {Learning}},
  volume     = {6},
  issn       = {2291-9694},
  shorttitle = {Extraction of {Information} {Related} to {Adverse} {Drug} {Events} from {Electronic} {Health} {Record} {Notes}},
  doi        = {10.2196/12159},
  abstract   = {BACKGROUND: Pharmacovigilance and drug-safety surveillance are crucial for monitoring adverse drug events (ADEs), but the main ADE-reporting systems such as Food and Drug Administration Adverse Event Reporting System face challenges such as underreporting. Therefore, as complementary surveillance, data on ADEs are extracted from electronic health record (EHR) notes via natural language processing (NLP). As NLP develops, many up-to-date machine-learning techniques are introduced in this field, such as deep learning and multi-task learning (MTL). However, only a few studies have focused on employing such techniques to extract ADEs.
OBJECTIVE: We aimed to design a deep learning model for extracting ADEs and related information such as medications and indications. Since extraction of ADE-related information includes two steps-named entity recognition and relation extraction-our second objective was to improve the deep learning model using multi-task learning between the two steps.
METHODS: We employed the dataset from the Medication, Indication and Adverse Drug Events (MADE) 1.0 challenge to train and test our models. This dataset consists of 1089 EHR notes of cancer patients and includes 9 entity types such as Medication, Indication, and ADE and 7 types of relations between these entities. To extract information from the dataset, we proposed a deep-learning model that uses a bidirectional long short-term memory (BiLSTM) conditional random field network to recognize entities and a BiLSTM-Attention network to extract relations. To further improve the deep-learning model, we employed three typical MTL methods, namely, hard parameter sharing, parameter regularization, and task relation learning, to build three MTL models, called HardMTL, RegMTL, and LearnMTL, respectively.
RESULTS: Since extraction of ADE-related information is a two-step task, the result of the second step (ie, relation extraction) was used to compare all models. We used microaveraged precision, recall, and F1 as evaluation metrics. Our deep learning model achieved state-of-the-art results (F1=65.9\%), which is significantly higher than that (F1=61.7\%) of the best system in the MADE1.0 challenge. HardMTL further improved the F1 by 0.8\%, boosting the F1 to 66.7\%, whereas RegMTL and LearnMTL failed to boost the performance.
CONCLUSIONS: Deep learning models can significantly improve the performance of ADE-related information extraction. MTL may be effective for named entity recognition and relation extraction, but it depends on the methods, data, and other factors. Our results can facilitate research on ADE detection, NLP, and machine learning.},
  language   = {eng},
  number     = {4},
  journal    = {JMIR medical informatics},
  author     = {Li, Fei and Liu, Weisong and Yu, Hong},
  month      = nov,
  year       = {2018},
  pmid       = {30478023},
  pmcid      = {PMC6288593},
  keywords   = {adverse drug event, deep learning, multi-task learning, named entity recognition, natural language processing, relation extraction},
  pages      = {e12159}
}

BACKGROUND: Pharmacovigilance and drug-safety surveillance are crucial for monitoring adverse drug events (ADEs), but the main ADE-reporting systems such as Food and Drug Administration Adverse Event Reporting System face challenges such as underreporting. Therefore, as complementary surveillance, data on ADEs are extracted from electronic health record (EHR) notes via natural language processing (NLP). As NLP develops, many up-to-date machine-learning techniques are introduced in this field, such as deep learning and multi-task learning (MTL). However, only a few studies have focused on employing such techniques to extract ADEs. OBJECTIVE: We aimed to design a deep learning model for extracting ADEs and related information such as medications and indications. Since extraction of ADE-related information includes two steps-named entity recognition and relation extraction-our second objective was to improve the deep learning model using multi-task learning between the two steps. METHODS: We employed the dataset from the Medication, Indication and Adverse Drug Events (MADE) 1.0 challenge to train and test our models. This dataset consists of 1089 EHR notes of cancer patients and includes 9 entity types such as Medication, Indication, and ADE and 7 types of relations between these entities. To extract information from the dataset, we proposed a deep-learning model that uses a bidirectional long short-term memory (BiLSTM) conditional random field network to recognize entities and a BiLSTM-Attention network to extract relations. To further improve the deep-learning model, we employed three typical MTL methods, namely, hard parameter sharing, parameter regularization, and task relation learning, to build three MTL models, called HardMTL, RegMTL, and LearnMTL, respectively. RESULTS: Since extraction of ADE-related information is a two-step task, the result of the second step (ie, relation extraction) was used to compare all models. 
We used microaveraged precision, recall, and F1 as evaluation metrics. Our deep learning model achieved state-of-the-art results (F1=65.9%), which is significantly higher than that (F1=61.7%) of the best system in the MADE1.0 challenge. HardMTL further improved the F1 by 0.8%, boosting the F1 to 66.7%, whereas RegMTL and LearnMTL failed to boost the performance. CONCLUSIONS: Deep learning models can significantly improve the performance of ADE-related information extraction. MTL may be effective for named entity recognition and relation extraction, but it depends on the methods, data, and other factors. Our results can facilitate research on ADE detection, NLP, and machine learning.

@article{pradhan_inadequate_2018,
  title      = {Inadequate diversity of information resources searched in {US}-affiliated systematic reviews and meta-analyses: 2005-2016},
  volume     = {102},
  issn       = {1878-5921},
  shorttitle = {Inadequate diversity of information resources searched in {US}-affiliated systematic reviews and meta-analyses},
  doi        = {10.1016/j.jclinepi.2018.05.024},
  abstract   = {OBJECTIVE: Systematic reviews and meta-analyses (SRMAs) rely upon comprehensive searches into diverse resources that catalog primary studies. However, since what constitutes a comprehensive search is unclear, we examined trends in databases searched from 2005-2016, surrounding the publication of search guidelines in 2013, and associations between resources searched and evidence of publication bias in SRMAs involving human subjects.
STUDY DESIGN: To ensure comparability of included SRMAs over the 12 years in the face of a near 100-fold increase of international SRMAs (mainly genetic studies from China) during this period, we focused on USA-affiliated SRMAs, manually reviewing 100 randomly selected SRMAs from those published in each year. After excluding articles (mainly for inadequate detail or out-of-scope methods), we identified factors associated with the databases searched, used network analysis to see which resources were simultaneously searched, and used logistic regression to link information sources searched with a lower chance of finding publication bias.
RESULTS: Among 817 SRMA articles studied, the common resources used were Medline (95\%), EMBASE (44\%), and Cochrane (41\%). Methods journal SRMAs were most likely to use registries and grey literature resources. We found substantial co-searching of resources with only published materials, and not complemented by searches of registries and the grey literature. The 2013 guideline did not substantially increase searching of registries and grey literature resources to retrieve primary studies for the SRMAs. When used to augment Medline, Scopus (in all SRMAs) and ClinicalTrials.gov (in SRMAs with safety outcomes) were negatively associated with publication bias.
CONCLUSIONS: Even SRMAs that search multiple sources tend to search similar resources. Our study supports searching Scopus and CTG in addition to Medline to reduce the chance of publication bias.},
  language   = {eng},
  journal    = {Journal of Clinical Epidemiology},
  author     = {Pradhan, Richeek and Garnick, Kyle and Barkondaj, Bikramjit and Jordan, Harmon S. and Ash, Arlene and Yu, Hong},
  month      = oct,
  year       = {2018},
  pmid       = {29879464},
  pmcid      = {PMC6250602},
  keywords   = {Evidence synthesis, Grey literature, Literature databases, Meta-analysis, Publication bias, Systematic review, Trial registries},
  pages      = {50--62}
}

OBJECTIVE: Systematic reviews and meta-analyses (SRMAs) rely upon comprehensive searches into diverse resources that catalog primary studies. However, since what constitutes a comprehensive search is unclear, we examined trends in databases searched from 2005-2016, surrounding the publication of search guidelines in 2013, and associations between resources searched and evidence of publication bias in SRMAs involving human subjects. STUDY DESIGN: To ensure comparability of included SRMAs over the 12 years in the face of a near 100-fold increase of international SRMAs (mainly genetic studies from China) during this period, we focused on USA-affiliated SRMAs, manually reviewing 100 randomly selected SRMAs from those published in each year. After excluding articles (mainly for inadequate detail or out-of-scope methods), we identified factors associated with the databases searched, used network analysis to see which resources were simultaneously searched, and used logistic regression to link information sources searched with a lower chance of finding publication bias. RESULTS: Among 817 SRMA articles studied, the common resources used were Medline (95%), EMBASE (44%), and Cochrane (41%). Methods journal SRMAs were most likely to use registries and grey literature resources. We found substantial co-searching of resources with only published materials, and not complemented by searches of registries and the grey literature. The 2013 guideline did not substantially increase searching of registries and grey literature resources to retrieve primary studies for the SRMAs. When used to augment Medline, Scopus (in all SRMAs) and ClinicalTrials.gov (in SRMAs with safety outcomes) were negatively associated with publication bias. CONCLUSIONS: Even SRMAs that search multiple sources tend to search similar resources. Our study supports searching Scopus and CTG in addition to Medline to reduce the chance of publication bias.

@article{lalor_comprehenotes_2018,
  title      = {{ComprehENotes}, an {Instrument} to {Assess} {Patient} {Reading} {Comprehension} of {Electronic} {Health} {Record} {Notes}: {Development} and {Validation}},
  volume     = {20},
  issn       = {1438-8871},
  shorttitle = {{ComprehENotes}, an {Instrument} to {Assess} {Patient} {Reading} {Comprehension} of {Electronic} {Health} {Record} {Notes}},
  doi        = {10.2196/jmir.9380},
  abstract   = {BACKGROUND: Patient portals are widely adopted in the United States and allow millions of patients access to their electronic health records (EHRs), including their EHR clinical notes. A patient's ability to understand the information in the EHR is dependent on their overall health literacy. Although many tests of health literacy exist, none specifically focuses on EHR note comprehension.
OBJECTIVE: The aim of this paper was to develop an instrument to assess patients' EHR note comprehension.
METHODS: We identified 6 common diseases or conditions (heart failure, diabetes, cancer, hypertension, chronic obstructive pulmonary disease, and liver failure) and selected 5 representative EHR notes for each disease or condition. One note that did not contain natural language text was removed. Questions were generated from these notes using Sentence Verification Technique and were analyzed using item response theory (IRT) to identify a set of questions that represent a good test of ability for EHR note comprehension.
RESULTS: Using Sentence Verification Technique, 154 questions were generated from the 29 EHR notes initially obtained. Of these, 83 were manually selected for inclusion in the Amazon Mechanical Turk crowdsourcing tasks and 55 were ultimately retained following IRT analysis. A follow-up validation with a second Amazon Mechanical Turk task and IRT analysis confirmed that the 55 questions test a latent ability dimension for EHR note comprehension. A short test of 14 items was created along with the 55-item test.
CONCLUSIONS: We developed ComprehENotes, an instrument for assessing EHR note comprehension from existing EHR notes, gathered responses using crowdsourcing, and used IRT to analyze those responses, thus resulting in a set of questions to measure EHR note comprehension. Crowdsourced responses from Amazon Mechanical Turk can be used to estimate item parameters and select a subset of items for inclusion in the test set using IRT. The final set of questions is the first test of EHR note comprehension.},
  language   = {eng},
  number     = {4},
  journal    = {Journal of Medical Internet Research},
  author     = {Lalor, John P. and Wu, Hao and Chen, Li and Mazor, Kathleen M. and Yu, Hong},
  month      = apr,
  year       = {2018},
  pmid       = {29695372},
  pmcid      = {PMC5943623},
  keywords   = {crowdsourcing, electronic health records, health literacy, psychometrics},
  pages      = {e139}
}

BACKGROUND: Patient portals are widely adopted in the United States and allow millions of patients access to their electronic health records (EHRs), including their EHR clinical notes. A patient's ability to understand the information in the EHR is dependent on their overall health literacy. Although many tests of health literacy exist, none specifically focuses on EHR note comprehension. OBJECTIVE: The aim of this paper was to develop an instrument to assess patients' EHR note comprehension. METHODS: We identified 6 common diseases or conditions (heart failure, diabetes, cancer, hypertension, chronic obstructive pulmonary disease, and liver failure) and selected 5 representative EHR notes for each disease or condition. One note that did not contain natural language text was removed. Questions were generated from these notes using Sentence Verification Technique and were analyzed using item response theory (IRT) to identify a set of questions that represent a good test of ability for EHR note comprehension. RESULTS: Using Sentence Verification Technique, 154 questions were generated from the 29 EHR notes initially obtained. Of these, 83 were manually selected for inclusion in the Amazon Mechanical Turk crowdsourcing tasks and 55 were ultimately retained following IRT analysis. A follow-up validation with a second Amazon Mechanical Turk task and IRT analysis confirmed that the 55 questions test a latent ability dimension for EHR note comprehension. A short test of 14 items was created along with the 55-item test. CONCLUSIONS: We developed ComprehENotes, an instrument for assessing EHR note comprehension from existing EHR notes, gathered responses using crowdsourcing, and used IRT to analyze those responses, thus resulting in a set of questions to measure EHR note comprehension. Crowdsourced responses from Amazon Mechanical Turk can be used to estimate item parameters and select a subset of items for inclusion in the test set using IRT. 
The final set of questions is the first test of EHR note comprehension.

Background: Atrial fibrillation (AF) is a common complication of acute myocardial infarction (AMI). The CHA2DS2-VASc and CHADS2 risk scores are used to identify patients with AF at risk for stroke and to guide oral anticoagulants (OAC) use, including patients with AMI. However, the epidemiology of AF, further stratified according to patients' risk of stroke, has not been well characterized among those hospitalized for AMI. Methods: We examined trends in the frequency of AF, rates of discharge OAC use, and post-discharge outcomes among 6,627 residents of the Worcester, Massachusetts area who survived hospitalization for AMI at 11 medical centers between 1997 and 2011. Results: A total of 1,050 AMI patients had AF (16%) and the majority (91%) had a CHA2DS2-VASc score >2. AF rates were highest among patients in the highest stroke risk group. In comparison to patients without AF, patients with AMI and AF in the highest stroke risk category had higher rates of post-discharge complications, including higher 30-day re-hospitalization [27% vs. 17%], 30-day post-discharge death [10% vs. 5%], and 1-year post-discharge death [46% vs. 18%] (p < 0.001 for all). Notably, fewer than half of guideline-eligible AF patients received an OAC prescription at discharge. Usage rates for other evidence-based therapies, such as statins and beta-blockers, lagged in comparison to AMI patients free from AF. Conclusions: Our findings highlight the need to enhance efforts towards stroke prevention among AMI survivors with AF.

@inproceedings{munkhdalai_meta_2017,
  address   = {Sydney, Australia},
  title     = {Meta {Networks}},
  volume    = {70},
  abstract  = {Neural networks have been successfully applied in applications with a large amount of labeled data. However, the task of rapid generalization on new concepts with small training data while preserving performances on previously learned ones still presents a significant challenge to neural network models. In this work, we introduce a novel meta learning method, Meta Networks (MetaNet), that learns a meta-level knowledge across tasks and shifts its inductive biases via fast parameterization for rapid generalization. When evaluated on Omniglot and Mini-ImageNet benchmarks, our MetaNet models achieve a near human-level performance and outperform the baseline approaches by up to 6\% accuracy. We demonstrate several appealing properties of MetaNet relating to generalization and continual learning.},
  booktitle = {{ICML}},
  author    = {Munkhdalai, Tsendsuren and Yu, Hong},
  month     = aug,
  year      = {2017},
  pmid      = {31106300},
  pmcid     = {PMC6519722},
  pages     = {2554--2563}
}

Neural networks have been successfully applied in applications with a large amount of labeled data. However, the task of rapid generalization on new concepts with small training data while preserving performances on previously learned ones still presents a significant challenge to neural network models. In this work, we introduce a novel meta learning method, Meta Networks (MetaNet), that learns a meta-level knowledge across tasks and shifts its inductive biases via fast parameterization for rapid generalization. When evaluated on Omniglot and Mini-ImageNet benchmarks, our MetaNet models achieve a near human-level performance and outperform the baseline approaches by up to 6% accuracy. We demonstrate several appealing properties of MetaNet relating to generalization and continual learning.

We present a memory augmented neural network for natural language understanding: Neural Semantic Encoders. NSE is equipped with a novel memory update rule and has a variable sized encoding memory that evolves over time and maintains the understanding of input sequences through read, compose and write operations. NSE can also access multiple and shared memories. In this paper, we demonstrated the effectiveness and the flexibility of NSE on five different natural language tasks: natural language inference, question answering, sentence classification, document sentiment analysis and machine translation where NSE achieved state-of-the-art performance when evaluated on publicly available benchmarks. For example, our shared-memory model showed an encouraging result on neural machine translation, improving an attention-based baseline by approximately 1.0 BLEU.

@article{lingeman_detecting_2017,
  title    = {Detecting {Opioid}-{Related} {Aberrant} {Behavior} using {Natural} {Language} {Processing}},
  volume   = {2017},
  issn     = {1942-597X},
  abstract = {The United States is in the midst of a prescription opioid epidemic, with the number of yearly opioid-related overdose deaths increasing almost fourfold since 2000. To more effectively prevent unintentional opioid overdoses, the medical profession requires robust surveillance tools that can effectively identify at-risk patients. Drug-related aberrant behaviors observed in the clinical context may be important indicators of patients at risk for or actively abusing opioids. In this paper, we describe a natural language processing (NLP) method for automatic surveillance of aberrant behavior in medical notes relying only on the text of the notes. This allows for a robust and generalizable system that can be used for high volume analysis of electronic medical records for potential predictors of opioid abuse.},
  language = {eng},
  journal  = {AMIA Annual Symposium Proceedings},
  author   = {Lingeman, Jesse M. and Wang, Priscilla and Becker, William and Yu, Hong},
  year     = {2017},
  pmid     = {29854186},
  pmcid    = {PMC5977697},
  pages    = {1179--1185}
}

The United States is in the midst of a prescription opioid epidemic, with the number of yearly opioid-related overdose deaths increasing almost fourfold since 2000. To more effectively prevent unintentional opioid overdoses, the medical profession requires robust surveillance tools that can effectively identify at-risk patients. Drug-related aberrant behaviors observed in the clinical context may be important indicators of patients at risk for or actively abusing opioids. In this paper, we describe a natural language processing (NLP) method for automatic surveillance of aberrant behavior in medical notes relying only on the text of the notes. This allows for a robust and generalizable system that can be used for high volume analysis of electronic medical records for potential predictors of opioid abuse.

@inproceedings{lalor_cift:_2017,
  title    = {{CIFT}: {Crowd}-{Informed} {Fine}-{Tuning} to {Improve} {Machine} {Learning} {Ability}},
  abstract = {Item Response Theory (IRT) allows for measuring ability of Machine Learning models as compared to a human population. However, it is difficult to create a large dataset to train the ability of deep neural network models (DNNs). We propose Crowd-Informed Fine-Tuning (CIFT) as a new training process, where a pre-trained model is fine-tuned with a specialized supplemental training set obtained via IRT model-fitting on a large set of crowdsourced response patterns. With CIFT we can leverage the specialized set of data obtained through IRT to inform parameter tuning in DNNs. We experiment with two loss functions in CIFT to represent (i) memorization of fine-tuning items and (ii) learning a probability distribution over potential labels that is similar to the crowdsourced distribution over labels to simulate crowd knowledge. Our results show that CIFT improves ability for a state-of-the-art DNN model for Recognizing Textual Entailment (RTE) tasks and is generalizable to a large-scale RTE test set.},
  author   = {Lalor, J and Wu, H and Yu, H},
  month    = feb,
  year     = {2017}
}

Item Response Theory (IRT) allows for measuring ability of Machine Learning models as compared to a human population. However, it is difficult to create a large dataset to train the ability of deep neural network models (DNNs). We propose Crowd-Informed Fine-Tuning (CIFT) as a new training process, where a pre-trained model is fine-tuned with a specialized supplemental training set obtained via IRT model-fitting on a large set of crowdsourced response patterns. With CIFT we can leverage the specialized set of data obtained through IRT to inform parameter tuning in DNNs. We experiment with two loss functions in CIFT to represent (i) memorization of fine-tuning items and (ii) learning a probability distribution over potential labels that is similar to the crowdsourced distribution over labels to simulate crowd knowledge. Our results show that CIFT improves ability for a state-of-the-art DNN model for Recognizing Textual Entailment (RTE) tasks and is generalizable to a large-scale RTE test set.

@inproceedings{munkhdalai_reasoning_2017,
title = {Reasoning with memory augmented neural networks for language comprehension},
url = {https://arxiv.org/abs/1610.06454},
abstract = {Hypothesis testing is an important cognitive process that supports human reasoning. In this paper, we introduce a computational hypothesis testing approach based on memory augmented neural networks. Our approach involves a hypothesis testing loop that reconsiders and progressively refines a previously formed hypothesis in order to generate new hypotheses to test. We apply the proposed approach to language comprehension task by using Neural Semantic Encoders (NSE). Our NSE models achieve the state-of-the-art results showing an absolute improvement of 1.2\% to 2.6\% accuracy over previous results obtained by single and ensemble systems on standard machine comprehension benchmarks such as the Children's Book Test (CBT) and Who-Did-What (WDW) news article datasets.},
urldate = {2017-06-02},
booktitle = {5th {International} {Conference} on {Learning} {Representations} ({ICLR})},
author = {Munkhdalai, Tsendsuren and Yu, Hong},
year = {2017}
}

Hypothesis testing is an important cognitive process that supports human reasoning. In this paper, we introduce a computational hypothesis testing approach based on memory augmented neural networks. Our approach involves a hypothesis testing loop that reconsiders and progressively refines a previously formed hypothesis in order to generate new hypotheses to test. We apply the proposed approach to language comprehension task by using Neural Semantic Encoders (NSE). Our NSE models achieve the state-of-the-art results showing an absolute improvement of 1.2% to 2.6% accuracy over previous results obtained by single and ensemble systems on standard machine comprehension benchmarks such as the Children's Book Test (CBT) and Who-Did-What (WDW) news article datasets.

@article{zheng_readability_2017,
title = {Readability {Formulas} and {User} {Perceptions} of {Electronic} {Health} {Records} {Difficulty}: {A} {Corpus} {Study}},
volume = {19},
copyright = {Unless stated otherwise, all articles are open-access distributed under the terms of the Creative Commons Attribution License (http://creativecommons.org/licenses/by/2.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work (},
shorttitle = {Readability {Formulas} and {User} {Perceptions} of {Electronic} {Health} {Records} {Difficulty}},
url = {https://www.jmir.org/2017/3/e59/},
doi = {10.2196/jmir.6962},
abstract = {Background: Electronic health records (EHRs) are a rich resource for developing applications to engage patients and foster patient activation, thus holding a strong potential to enhance patient-centered care. Studies have shown that providing patients with access to their own EHR notes may improve the understanding of their own clinical conditions and treatments, leading to improved health care outcomes. However, the highly technical language in EHR notes impedes patients’ comprehension. Numerous studies have evaluated the difficulty of health-related text using readability formulas such as Flesch-Kincaid Grade Level (FKGL), Simple Measure of Gobbledygook (SMOG), and Gunning-Fog Index (GFI). They conclude that the materials are often written at a grade level higher than common recommendations. Objective: The objective of our study was to explore the relationship between the aforementioned readability formulas and the laypeople’s perceived difficulty on 2 genres of text: general health information and EHR notes. We also validated the formulas’ appropriateness and generalizability on predicting difficulty levels of highly complex technical documents. Methods: We collected 140 Wikipedia articles on diabetes and 242 EHR notes with diabetes International Classification of Diseases, Ninth Revision code. We recruited 15 Amazon Mechanical Turk (AMT) users to rate difficulty levels of the documents. Correlations between laypeople’s perceived difficulty levels and readability formula scores were measured, and their difference was tested. We also compared word usage and the impact of medical concepts of the 2 genres of text. Results: The distributions of both readability formulas’ scores (P{\textless}.001) and laypeople’s perceptions (P=.002) on the 2 genres were different. Correlations of readability predictions and laypeople’s perceptions were weak. 
Furthermore, despite being graded at similar levels, documents of different genres were still perceived with different difficulty (P{\textless}.001). Word usage in the 2 related genres still differed significantly (P{\textless}.001). Conclusions: Our findings suggested that the readability formulas’ predictions did not align with perceived difficulty in either text genre. The widely used readability formulas were highly correlated with each other but did not show adequate correlation with readers’ perceived difficulty. Therefore, they were not appropriate to assess the readability of EHR notes. [J Med Internet Res 2017;19(3):e59]},
language = {en},
number = {3},
urldate = {2017-03-06},
journal = {Journal of Medical Internet Research},
author = {Zheng, Jiaping and Yu, Hong},
year = {2017},
pmid = {28254738},
pmcid = {PMC5355629},
pages = {e59}
}

Background: Electronic health records (EHRs) are a rich resource for developing applications to engage patients and foster patient activation, thus holding a strong potential to enhance patient-centered care. Studies have shown that providing patients with access to their own EHR notes may improve the understanding of their own clinical conditions and treatments, leading to improved health care outcomes. However, the highly technical language in EHR notes impedes patients’ comprehension. Numerous studies have evaluated the difficulty of health-related text using readability formulas such as Flesch-Kincaid Grade Level (FKGL), Simple Measure of Gobbledygook (SMOG), and Gunning-Fog Index (GFI). They conclude that the materials are often written at a grade level higher than common recommendations. Objective: The objective of our study was to explore the relationship between the aforementioned readability formulas and the laypeople’s perceived difficulty on 2 genres of text: general health information and EHR notes. We also validated the formulas’ appropriateness and generalizability on predicting difficulty levels of highly complex technical documents. Methods: We collected 140 Wikipedia articles on diabetes and 242 EHR notes with diabetes International Classification of Diseases, Ninth Revision code. We recruited 15 Amazon Mechanical Turk (AMT) users to rate difficulty levels of the documents. Correlations between laypeople’s perceived difficulty levels and readability formula scores were measured, and their difference was tested. We also compared word usage and the impact of medical concepts of the 2 genres of text. Results: The distributions of both readability formulas’ scores (P<.001) and laypeople’s perceptions (P=.002) on the 2 genres were different. Correlations of readability predictions and laypeople’s perceptions were weak. 
Furthermore, despite being graded at similar levels, documents of different genres were still perceived with different difficulty (P<.001). Word usage in the 2 related genres still differed significantly (P<.001). Conclusions: Our findings suggested that the readability formulas’ predictions did not align with perceived difficulty in either text genre. The widely used readability formulas were highly correlated with each other but did not show adequate correlation with readers’ perceived difficulty. Therefore, they were not appropriate to assess the readability of EHR notes. [J Med Internet Res 2017;19(3):e59]

@inproceedings{munkhdalai_neural_2017-1,
  author    = {Munkhdalai, Tsendsuren and Yu, Hong},
  title     = {Neural {Tree} {Indexers} for {Text} {Understanding}},
  booktitle = {Proceedings of the 15th {Conference} of the {European} {Chapter} of the {Association} for {Computational} {Linguistics}: {Volume} 1, {Long} {Papers}},
  publisher = {Association for Computational Linguistics},
  address   = {Valencia, Spain},
  month     = apr,
  year      = {2017},
  pages     = {11--21},
  url       = {http://www.aclweb.org/anthology/E17-1002},
  urldate   = {2017-04-02},
  abstract  = {Recurrent neural networks (RNNs) process input text sequentially and model the conditional transition between word tokens. In contrast, the advantages of recursive networks include that they explicitly model the compositionality and the recursive structure of natural language. However, the current recursive architecture is limited by its dependence on syntactic tree. In this paper, we introduce a robust syntactic parsing-independent tree structured model, Neural Tree Indexers (NTI) that provides a middle ground between the sequential RNNs and the syntactic treebased recursive models. NTI constructs a full n-ary tree by processing the input text with its node function in a bottom-up fashion. Attention mechanism can then be applied to both structure and node function. We implemented and evaluated a binary tree model of NTI, showing the model achieved the state-of-the-art performance on three different NLP tasks: natural language inference, answer sentence selection, and sentence classification, outperforming state-of-the-art recurrent and recursive neural networks.}
}

Recurrent neural networks (RNNs) process input text sequentially and model the conditional transition between word tokens. In contrast, the advantages of recursive networks include that they explicitly model the compositionality and the recursive structure of natural language. However, the current recursive architecture is limited by its dependence on syntactic tree. In this paper, we introduce a robust syntactic parsing-independent tree structured model, Neural Tree Indexers (NTI) that provides a middle ground between the sequential RNNs and the syntactic treebased recursive models. NTI constructs a full n-ary tree by processing the input text with its node function in a bottom-up fashion. Attention mechanism can then be applied to both structure and node function. We implemented and evaluated a binary tree model of NTI, showing the model achieved the state-of-the-art performance on three different NLP tasks: natural language inference, answer sentence selection, and sentence classification, outperforming state-of-the-art recurrent and recursive neural networks.

@inproceedings{lalor_generating_2017,
title = {Generating a {Test} of {Electronic} {Health} {Record} {Narrative} {Comprehension} with {Item} {Response} {Theory}},
abstract = {In this work, we report the development of a new instrument to test patients' ability to comprehend EHR notes. Our instrument comprises of a test set of question and answer pairs that are based on the semantic content of EHR notes and selected using the psychometrics method Item Response Theory.},
author = {Lalor, J and Wu, H and Chen, L and Mazor, K and Yu, H},
month = nov,
year = {2017},
internal-note = {TODO(review): required booktitle (venue) is missing for this inproceedings entry; author given names are abbreviated, unlike sibling entries (e.g. lalor_analysis_2017-1) -- expand once the venue and full names are confirmed}
}

In this work, we report the development of a new instrument to test patients' ability to comprehend EHR notes. Our instrument comprises of a test set of question and answer pairs that are based on the semantic content of EHR notes and selected using the psychometrics method Item Response Theory.

@article{chen_ranking_2017,
  author     = {Chen, Jinying and Jagannatha, Abhyuday N. and Fodeh, Samah J. and Yu, Hong},
  title      = {Ranking {Medical} {Terms} to {Support} {Expansion} of {Lay} {Language} {Resources} for {Patient} {Comprehension} of {Electronic} {Health} {Record} {Notes}: {Adapted} {Distant} {Supervision} {Approach}},
  shorttitle = {Ranking {Medical} {Terms} to {Support} {Expansion} of {Lay} {Language} {Resources} for {Patient} {Comprehension} of {Electronic} {Health} {Record} {Notes}},
  journal    = {JMIR medical informatics},
  volume     = {5},
  number     = {4},
  pages      = {e42},
  month      = oct,
  year       = {2017},
  issn       = {2291-9694},
  doi        = {10.2196/medinform.8531},
  pmid       = {29089288},
  pmcid      = {PMC5686421},
  language   = {eng},
  keywords   = {Information extraction, electronic health records, lexical entry selection, natural language processing, transfer learning},
  abstract   = {BACKGROUND: Medical terms are a major obstacle for patients to comprehend their electronic health record (EHR) notes. Clinical natural language processing (NLP) systems that link EHR terms to lay terms or definitions allow patients to easily access helpful information when reading through their EHR notes, and have shown to improve patient EHR comprehension. However, high-quality lay language resources for EHR terms are very limited in the public domain. Because expanding and curating such a resource is a costly process, it is beneficial and even necessary to identify terms important for patient EHR comprehension first.
OBJECTIVE: We aimed to develop an NLP system, called adapted distant supervision (ADS), to rank candidate terms mined from EHR corpora. We will give EHR terms ranked as high by ADS a higher priority for lay language annotation-that is, creating lay definitions for these terms.
METHODS: Adapted distant supervision uses distant supervision from consumer health vocabulary and transfer learning to adapt itself to solve the problem of ranking EHR terms in the target domain. We investigated 2 state-of-the-art transfer learning algorithms (ie, feature space augmentation and supervised distant supervision) and designed 5 types of learning features, including distributed word representations learned from large EHR data for ADS. For evaluating ADS, we asked domain experts to annotate 6038 candidate terms as important or nonimportant for EHR comprehension. We then randomly divided these data into the target-domain training data (1000 examples) and the evaluation data (5038 examples). We compared ADS with 2 strong baselines, including standard supervised learning, on the evaluation data.
RESULTS: The ADS system using feature space augmentation achieved the best average precision, 0.850, on the evaluation set when using 1000 target-domain training examples. The ADS system using supervised distant supervision achieved the best average precision, 0.819, on the evaluation set when using only 100 target-domain training examples. The 2 ADS systems both performed significantly better than the baseline systems (P{\textless}.001 for all measures and all conditions). Using a rich set of learning features contributed to ADS's performance substantially.
CONCLUSIONS: ADS can effectively rank terms mined from EHRs. Transfer learning improved ADS's performance even with a small number of target-domain training examples. EHR terms prioritized by ADS were used to expand a lay language resource that supports patient EHR comprehension. The top 10,000 EHR terms ranked by ADS are available upon request.}
}

BACKGROUND: Medical terms are a major obstacle for patients to comprehend their electronic health record (EHR) notes. Clinical natural language processing (NLP) systems that link EHR terms to lay terms or definitions allow patients to easily access helpful information when reading through their EHR notes, and have shown to improve patient EHR comprehension. However, high-quality lay language resources for EHR terms are very limited in the public domain. Because expanding and curating such a resource is a costly process, it is beneficial and even necessary to identify terms important for patient EHR comprehension first. OBJECTIVE: We aimed to develop an NLP system, called adapted distant supervision (ADS), to rank candidate terms mined from EHR corpora. We will give EHR terms ranked as high by ADS a higher priority for lay language annotation-that is, creating lay definitions for these terms. METHODS: Adapted distant supervision uses distant supervision from consumer health vocabulary and transfer learning to adapt itself to solve the problem of ranking EHR terms in the target domain. We investigated 2 state-of-the-art transfer learning algorithms (ie, feature space augmentation and supervised distant supervision) and designed 5 types of learning features, including distributed word representations learned from large EHR data for ADS. For evaluating ADS, we asked domain experts to annotate 6038 candidate terms as important or nonimportant for EHR comprehension. We then randomly divided these data into the target-domain training data (1000 examples) and the evaluation data (5038 examples). We compared ADS with 2 strong baselines, including standard supervised learning, on the evaluation data. RESULTS: The ADS system using feature space augmentation achieved the best average precision, 0.850, on the evaluation set when using 1000 target-domain training examples. 
The ADS system using supervised distant supervision achieved the best average precision, 0.819, on the evaluation set when using only 100 target-domain training examples. The 2 ADS systems both performed significantly better than the baseline systems (P<.001 for all measures and all conditions). Using a rich set of learning features contributed to ADS's performance substantially. CONCLUSIONS: ADS can effectively rank terms mined from EHRs. Transfer learning improved ADS's performance even with a small number of target-domain training examples. EHR terms prioritized by ADS were used to expand a lay language resource that supports patient EHR comprehension. The top 10,000 EHR terms ranked by ADS are available upon request.

@misc{lalor_analysis_2017-1,
title = {An {Analysis} of {Machine} {Learning} {Intelligence}},
url = {http://arxiv.org/abs/1702.04811},
abstract = {Deep neural networks (DNNs) have set state of the art results in many machine learning and NLP tasks. However, we do not have a strong understanding of what DNN models learn. In this paper, we examine learning in DNNs through analysis of their outputs. We compare DNN performance directly to a human population, and use characteristics of individual data points such as difficulty to see how well models perform on easy and hard examples. We investigate how training size and the incorporation of noise affect a DNN's ability to generalize and learn. Our experiments show that unlike traditional machine learning models (e.g., Naive Bayes, Decision Trees), DNNs exhibit human-like learning properties. As they are trained with more data, they are more able to distinguish between easy and difficult items, and performance on easy items improves at a higher rate than difficult items. We find that different DNN models exhibit different strengths in learning and are robust to noise in training data.},
urldate = {2017-02-26},
eprint = {1702.04811},
archiveprefix = {arXiv},
primaryclass = {cs.CL},
author = {Lalor, John P. and Wu, Hao and Munkhdalai, Tsendsuren and Yu, Hong},
month = feb,
year = {2017},
keywords = {Computer Science - Computation and Language}
}

Deep neural networks (DNNs) have set state of the art results in many machine learning and NLP tasks. However, we do not have a strong understanding of what DNN models learn. In this paper, we examine learning in DNNs through analysis of their outputs. We compare DNN performance directly to a human population, and use characteristics of individual data points such as difficulty to see how well models perform on easy and hard examples. We investigate how training size and the incorporation of noise affect a DNN's ability to generalize and learn. Our experiments show that unlike traditional machine learning models (e.g., Naive Bayes, Decision Trees), DNNs exhibit human-like learning properties. As they are trained with more data, they are more able to distinguish between easy and difficult items, and performance on easy items improves at a higher rate than difficult items. We find that different DNN models exhibit different strengths in learning and are robust to noise in training data.

@inproceedings{jagannatha_structured_2016,
title = {Structured prediction models for {RNN} based sequence labeling in clinical text},
volume = {2016},
abstract = {Sequence labeling is a widely used method for named entity recognition and information extraction from unstructured natural language data. In clinical domain one major application of sequence labeling involves extraction of medical entities such as medication, indication, and side-effects from Electronic Health Record narratives. Sequence labeling in this domain, presents its own set of challenges and objectives. In this work we experimented with various CRF based structured learning models with Recurrent Neural Networks. We extend the previously studied LSTM-CRF models with explicit modeling of pairwise potentials. We also propose an approximate version of skip-chain CRF inference with RNN potentials. We use these methodologies for structured prediction in order to improve the exact phrase detection of various medical entities.},
language = {eng},
booktitle = {Proceedings of the {Conference} on {Empirical} {Methods} in {Natural} {Language} {Processing}},
author = {Jagannatha, Abhyuday N. and Yu, Hong},
month = nov,
year = {2016},
pmid = {28004040},
pmcid = {PMC5167535},
keywords = {Computer Science - Computation and Language},
pages = {856--865}
}

Sequence labeling is a widely used method for named entity recognition and information extraction from unstructured natural language data. In clinical domain one major application of sequence labeling involves extraction of medical entities such as medication, indication, and side-effects from Electronic Health Record narratives. Sequence labeling in this domain, presents its own set of challenges and objectives. In this work we experimented with various CRF based structured learning models with Recurrent Neural Networks. We extend the previously studied LSTM-CRF models with explicit modeling of pairwise potentials. We also propose an approximate version of skip-chain CRF inference with RNN potentials. We use these methodologies for structured prediction in order to improve the exact phrase detection of various medical entities.

@misc{lingeman_learning_2016,
title = {Learning to {Rank} {Scientific} {Documents} from the {Crowd}},
url = {https://arxiv.org/pdf/1611.01400v1.pdf},
abstract = {Finding related published articles is an important task in any science, but with the explosion of new work in the biomedical domain it has become especially challenging. Most existing methodologies use text similarity metrics to identify whether two articles are related or not. However biomedical knowledge discovery is hypothesis-driven. The most related articles may not be ones with the highest text similarities. In this study, we first develop an innovative crowd-sourcing approach to build an expert-annotated document-ranking corpus. Using this corpus as the gold standard, we then evaluate the approaches of using text similarity to rank the relatedness of articles. Finally, we develop and evaluate a new supervised model to automatically rank related scientific articles. Our results show that authors' ranking differ significantly from rankings by text-similarity-based models. By training a learning-to-rank model on a subset of the annotated corpus, we found the best supervised learning-to-rank model (SVM-Rank) significantly surpassed state-of-the-art baseline systems.},
eprint = {1611.01400},
archiveprefix = {arXiv},
author = {Lingeman, Jesse M and Yu, Hong},
month = nov,
year = {2016}
}

Finding related published articles is an important task in any science, but with the explosion of new work in the biomedical domain it has become especially challenging. Most existing methodologies use text similarity metrics to identify whether two articles are related or not. However biomedical knowledge discovery is hypothesis-driven. The most related articles may not be ones with the highest text similarities. In this study, we first develop an innovative crowd-sourcing approach to build an expert-annotated document-ranking corpus. Using this corpus as the gold standard, we then evaluate the approaches of using text similarity to rank the relatedness of articles. Finally, we develop and evaluate a new supervised model to automatically rank related scientific articles. Our results show that authors' ranking differ significantly from rankings by text-similarity-based models. By training a learning-to-rank model on a subset of the annotated corpus, we found the best supervised learning-to-rank model (SVM-Rank) significantly surpassed state-of-the-art baseline systems.

@misc{liu_learning_2016,
title = {Learning for {Biomedical} {Information} {Extraction}: {Methodological} {Review} of {Recent} {Advances}},
url = {https://arxiv.org/ftp/arxiv/papers/1606/1606.07993.pdf},
abstract = {Biomedical information extraction (BioIE) is important to many applications, including clinical decision support, integrative biology, and pharmacovigilance, and therefore it has been an active research. Unlike existing reviews covering a holistic view on BioIE, this review focuses on mainly recent advances in learning based approaches, by systematically summarizing them into different aspects of methodological development. In addition, we dive into open information extraction and deep learning, two emerging and influential techniques and envision next generation of BioIE.},
eprint = {1606.07993},
archiveprefix = {arXiv},
author = {Liu, Feifan and Chen, Jinying and Jagannatha, Abhyuday and Yu, Hong},
month = jun,
year = {2016}
}

Biomedical information extraction (BioIE) is important to many applications, including clinical decision support, integrative biology, and pharmacovigilance, and therefore it has been an active research. Unlike existing reviews covering a holistic view on BioIE, this review focuses on mainly recent advances in learning based approaches, by systematically summarizing them into different aspects of methodological development. In addition, we dive into open information extraction and deep learning, two emerging and influential techniques and envision next generation of BioIE.

@misc{prakash_condensed_2016,
title = {Condensed {Memory} {Networks} for {Clinical} {Diagnostic} {Inferencing}},
url = {http://arxiv.org/abs/1612.01848},
abstract = {Diagnosis of a clinical condition is a challenging task, which often requires significant medical investigation. Previous work related to diagnostic inferencing problems mostly consider multivariate observational data (e.g. physiological signals, lab tests etc.). In contrast, we explore the problem using free-text medical notes recorded in an electronic health record (EHR). Complex tasks like these can benefit from structured knowledge bases, but those are not scalable. We instead exploit raw text from Wikipedia as a knowledge source. Memory networks have been demonstrated to be effective in tasks which require comprehension of free-form text. They use the final iteration of the learned representation to predict probable classes. We introduce condensed memory neural networks (C-MemNNs), a novel model with iterative condensation of memory representations that preserves the hierarchy of features in the memory. Experiments on the MIMIC-III dataset show that the proposed model outperforms other variants of memory networks to predict the most probable diagnoses given a complex clinical scenario.},
urldate = {2017-01-12},
eprint = {1612.01848},
archiveprefix = {arXiv},
primaryclass = {cs.CL},
author = {Prakash, Aaditya and Zhao, Siyuan and Hasan, Sadid A. and Datla, Vivek and Lee, Kathy and Qadir, Ashequl and Liu, Joey and Farri, Oladimeji},
month = dec,
year = {2016},
keywords = {Computer Science - Computation and Language}
}

Diagnosis of a clinical condition is a challenging task, which often requires significant medical investigation. Previous work related to diagnostic inferencing problems mostly consider multivariate observational data (e.g. physiological signals, lab tests etc.). In contrast, we explore the problem using free-text medical notes recorded in an electronic health record (EHR). Complex tasks like these can benefit from structured knowledge bases, but those are not scalable. We instead exploit raw text from Wikipedia as a knowledge source. Memory networks have been demonstrated to be effective in tasks which require comprehension of free-form text. They use the final iteration of the learned representation to predict probable classes. We introduce condensed memory neural networks (C-MemNNs), a novel model with iterative condensation of memory representations that preserves the hierarchy of features in the memory. Experiments on the MIMIC-III dataset show that the proposed model outperforms other variants of memory networks to predict the most probable diagnoses given a complex clinical scenario.

@article{chen_finding_2016,
  author     = {Chen, Jinying and Zheng, Jiaping and Yu, Hong},
  title      = {Finding {Important} {Terms} for {Patients} in {Their} {Electronic} {Health} {Records}: {A} {Learning}-to-{Rank} {Approach} {Using} {Expert} {Annotations}},
  shorttitle = {Finding {Important} {Terms} for {Patients} in {Their} {Electronic} {Health} {Records}},
  journal    = {JMIR medical informatics},
  volume     = {4},
  number     = {4},
  pages      = {e40},
  month      = nov,
  year       = {2016},
  doi        = {10.2196/medinform.6373},
  pmid       = {27903489},
  pmcid      = {PMC5156821},
  language   = {eng},
  keywords   = {Information extraction, Learning to rank, Supervised learning, electronic health records, natural language processing},
  abstract   = {BACKGROUND: Many health organizations allow patients to access their own electronic health record (EHR) notes through online patient portals as a way to enhance patient-centered care. However, EHR notes are typically long and contain abundant medical jargon that can be difficult for patients to understand. In addition, many medical terms in patients' notes are not directly related to their health care needs. One way to help patients better comprehend their own notes is to reduce information overload and help them focus on medical terms that matter most to them. Interventions can then be developed by giving them targeted education to improve their EHR comprehension and the quality of care.
OBJECTIVE: We aimed to develop a supervised natural language processing (NLP) system called Finding impOrtant medical Concepts most Useful to patientS (FOCUS) that automatically identifies and ranks medical terms in EHR notes based on their importance to the patients.
METHODS: First, we built an expert-annotated corpus. For each EHR note, 2 physicians independently identified medical terms important to the patient. Using the physicians' agreement as the gold standard, we developed and evaluated FOCUS. FOCUS first identifies candidate terms from each EHR note using MetaMap and then ranks the terms using a support vector machine-based learn-to-rank algorithm. We explored rich learning features, including distributed word representation, Unified Medical Language System semantic type, topic features, and features derived from consumer health vocabulary. We compared FOCUS with 2 strong baseline NLP systems.
RESULTS: Physicians annotated 90 EHR notes and identified a mean of 9 (SD 5) important terms per note. The Cohen's kappa annotation agreement was .51. The 10-fold cross-validation results show that FOCUS achieved an area under the receiver operating characteristic curve (AUC-ROC) of 0.940 for ranking candidate terms from EHR notes to identify important terms. When including term identification, the performance of FOCUS for identifying important terms from EHR notes was 0.866 AUC-ROC. Both performance scores significantly exceeded the corresponding baseline system scores (P{\textless}.001). Rich learning features contributed to FOCUS's performance substantially.
CONCLUSIONS: FOCUS can automatically rank terms from EHR notes based on their importance to patients. It may help develop future interventions that improve quality of care.}
}

BACKGROUND: Many health organizations allow patients to access their own electronic health record (EHR) notes through online patient portals as a way to enhance patient-centered care. However, EHR notes are typically long and contain abundant medical jargon that can be difficult for patients to understand. In addition, many medical terms in patients' notes are not directly related to their health care needs. One way to help patients better comprehend their own notes is to reduce information overload and help them focus on medical terms that matter most to them. Interventions can then be developed by giving them targeted education to improve their EHR comprehension and the quality of care. OBJECTIVE: We aimed to develop a supervised natural language processing (NLP) system called Finding impOrtant medical Concepts most Useful to patientS (FOCUS) that automatically identifies and ranks medical terms in EHR notes based on their importance to the patients. METHODS: First, we built an expert-annotated corpus. For each EHR note, 2 physicians independently identified medical terms important to the patient. Using the physicians' agreement as the gold standard, we developed and evaluated FOCUS. FOCUS first identifies candidate terms from each EHR note using MetaMap and then ranks the terms using a support vector machine-based learn-to-rank algorithm. We explored rich learning features, including distributed word representation, Unified Medical Language System semantic type, topic features, and features derived from consumer health vocabulary. We compared FOCUS with 2 strong baseline NLP systems. RESULTS: Physicians annotated 90 EHR notes and identified a mean of 9 (SD 5) important terms per note. The Cohen's kappa annotation agreement was .51. The 10-fold cross-validation results show that FOCUS achieved an area under the receiver operating characteristic curve (AUC-ROC) of 0.940 for ranking candidate terms from EHR notes to identify important terms. 
When including term identification, the performance of FOCUS for identifying important terms from EHR notes was 0.866 AUC-ROC. Both performance scores significantly exceeded the corresponding baseline system scores (P<.001). Rich learning features contributed to FOCUS's performance substantially. CONCLUSIONS: FOCUS can automatically rank terms from EHR notes based on their importance to patients. It may help develop future interventions that improve quality of care.

@article{lalor_building_2016,
title = {Building an {Evaluation} {Scale} using {Item} {Response} {Theory}},
url = {http://arxiv.org/abs/1605.08889},
abstract = {Evaluation of NLP methods requires testing against a previously vetted gold-standard test set and reporting standard metrics (accuracy/precision/recall/F1). The current assumption is that all items in a given test set are equal with regards to difficulty and discriminating power. We propose Item Response Theory (IRT) from psychometrics as an alternative means for gold-standard test-set generation and NLP system evaluation. IRT is able to describe characteristics of individual items - their difficulty and discriminating power - and can account for these characteristics in its estimation of human intelligence or ability for an NLP task. In this paper, we demonstrate IRT by generating a gold-standard test set for Recognizing Textual Entailment. By collecting a large number of human responses and fitting our IRT model, we show that our IRT model compares NLP systems with the performance in a human population and is able to provide more insight into system performance than standard evaluation metrics. We show that a high accuracy score does not always imply a high IRT score, which depends on the item characteristics and the response pattern.},
urldate = {2016-09-26},
eprint = {1605.08889},
eprinttype = {arXiv},
eprintclass = {cs.CL},
author = {Lalor, John P. and Wu, Hao and Yu, Hong},
month = may,
year = {2016},
keywords = {Computer Science - Computation and Language}
}

Evaluation of NLP methods requires testing against a previously vetted gold-standard test set and reporting standard metrics (accuracy/precision/recall/F1). The current assumption is that all items in a given test set are equal with regards to difficulty and discriminating power. We propose Item Response Theory (IRT) from psychometrics as an alternative means for gold-standard test-set generation and NLP system evaluation. IRT is able to describe characteristics of individual items - their difficulty and discriminating power - and can account for these characteristics in its estimation of human intelligence or ability for an NLP task. In this paper, we demonstrate IRT by generating a gold-standard test set for Recognizing Textual Entailment. By collecting a large number of human responses and fitting our IRT model, we show that our IRT model compares NLP systems with the performance in a human population and is able to provide more insight into system performance than standard evaluation metrics. We show that a high accuracy score does not always imply a high IRT score, which depends on the item characteristics and the response pattern.

Sequence labeling for extraction of medical events and their attributes from unstructured text in Electronic Health Record (EHR) notes is a key step towards semantic understanding of EHRs. It has important applications in health informatics including pharmacovigilance and drug surveillance. The state of the art supervised machine learning models in this domain are based on Conditional Random Fields (CRFs) with features calculated from fixed context windows. In this application, we explored various recurrent neural network frameworks and show that they significantly outperformed the CRF models.

@article{yin_detext:_2015,
title = {{DeTEXT}: {A} {Database} for {Evaluating} {Text} {Extraction} from {Biomedical} {Literature} {Figures}},
volume = {10},
issn = {1932-6203},
shorttitle = {{DeTEXT}},
url = {http://www.ncbi.nlm.nih.gov/pmc/articles/PMC4423993/},
doi = {10.1371/journal.pone.0126200},
abstract = {Hundreds of millions of figures are available in biomedical literature, representing important biomedical experimental evidence. Since text is a rich source of information in figures, automatically extracting such text may assist in the task of mining figure information. A high-quality ground truth standard can greatly facilitate the development of an automated system. This article describes DeTEXT: A database for evaluating text extraction from biomedical literature figures. It is the first publicly available, human-annotated, high quality, and large-scale figure-text dataset with 288 full-text articles, 500 biomedical figures, and 9308 text regions. This article describes how figures were selected from open-access full-text biomedical articles and how annotation guidelines and annotation tools were developed. We also discuss the inter-annotator agreement and the reliability of the annotations. We summarize the statistics of the DeTEXT data and make available evaluation protocols for DeTEXT. Finally we lay out challenges we observed in the automated detection and recognition of figure text and discuss research directions in this area. DeTEXT is publicly available for downloading at http://prir.ustb.edu.cn/DeTEXT/.},
number = {5},
urldate = {2015-06-03},
journal = {PLoS ONE},
author = {Yin, Xu-Cheng and Yang, Chun and Pei, Wei-Yi and Man, Haixia and Zhang, Jun and Learned-Miller, Erik and Yu, Hong},
month = may,
year = {2015},
pmid = {25951377},
pmcid = {PMC4423993},
pages = {e0126200}
}

Hundreds of millions of figures are available in biomedical literature, representing important biomedical experimental evidence. Since text is a rich source of information in figures, automatically extracting such text may assist in the task of mining figure information. A high-quality ground truth standard can greatly facilitate the development of an automated system. This article describes DeTEXT: A database for evaluating text extraction from biomedical literature figures. It is the first publicly available, human-annotated, high quality, and large-scale figure-text dataset with 288 full-text articles, 500 biomedical figures, and 9308 text regions. This article describes how figures were selected from open-access full-text biomedical articles and how annotation guidelines and annotation tools were developed. We also discuss the inter-annotator agreement and the reliability of the annotations. We summarize the statistics of the DeTEXT data and make available evaluation protocols for DeTEXT. Finally we lay out challenges we observed in the automated detection and recognition of figure text and discuss research directions in this area. DeTEXT is publicly available for downloading at http://prir.ustb.edu.cn/DeTEXT/.

@article{zheng_methods_2015,
title = {Methods for {Linking} {EHR} {Notes} to {Education} {Materials}},
volume = {2015},
issn = {2153-4063},
abstract = {It has been shown that providing patients with access to their own electronic health records (EHR) can enhance their medical understanding and provide clinically relevant benefits. However, languages that are difficult for non-medical professionals to comprehend are prevalent in the EHR notes, including medical terms, abbreviations, and domain-specific language patterns. Furthermore, limited average health literacy forms a barrier for patients to understand their health condition, impeding their ability to actively participate in managing their health. Therefore, we are developing a system to retrieve EHR note-tailored online consumer-oriented health education materials to improve patients' health knowledge of their own clinical conditions. Our experiments show that queries combining key concepts and other medical concepts present in the EHR notes significantly outperform (more than doubled) a baseline system of using the phrases from topic models.},
language = {eng},
journal = {AMIA Joint Summits on Translational Science Proceedings. AMIA Joint Summits on Translational Science},
author = {Zheng, Jiaping and Yu, Hong},
year = {2015},
pmid = {26306273},
pmcid = {PMC4525231},
pages = {209--215}
}

It has been shown that providing patients with access to their own electronic health records (EHR) can enhance their medical understanding and provide clinically relevant benefits. However, languages that are difficult for non-medical professionals to comprehend are prevalent in the EHR notes, including medical terms, abbreviations, and domain-specific language patterns. Furthermore, limited average health literacy forms a barrier for patients to understand their health condition, impeding their ability to actively participate in managing their health. Therefore, we are developing a system to retrieve EHR note-tailored online consumer-oriented health education materials to improve patients' health knowledge of their own clinical conditions. Our experiments show that queries combining key concepts and other medical concepts present in the EHR notes significantly outperform (more than doubled) a baseline system of using the phrases from topic models.

@inproceedings{yu_towards_2015,
title = {Towards {Mining} {Electronic} {Health} {Records} for {Opioid} {ADE} {Surveillance}},
url = {http://www.hsrd.research.va.gov/meetings/2015/abstract-display.cfm?RecordID=200},
abstract = {Objectives:
Prescription opioids are commonly used to treat acute and cancer-related pain, and, over the last two decades, have increasingly been used in the management of chronic non-cancer pain. Patients taking opioids can experience a wide range of adverse drug events (ADEs), including constipation, nausea/vomiting, pruritus, drowsiness and dizziness, hormonal dysfunction, depression, oversedation, falls, fractures, addiction, overdose, respiratory depression, sleep-disordered breathing, and death. Since such ADEs are frequently described in the unstructured electronic health record (EHR) notes, we are developing natural language processing (NLP) system to automatically extract opioid and ADEs from EHRs. The purpose of this study was to test out the feasibility of mining EHR notes for ADE detection using NLP approaches.
Methods:
We developed an annotation guideline using an interactive process during which physicians and linguists worked together to define rules and resolve discrepancy. Following the guideline, two annotators annotated 150 discharge summaries (or 8,672 sentences comprising 102,807 word tokens). The overall pairwise annotation agreement was 88\%. The total number of annotated ADEs and medications were 103 and 3,290. Using this annotated corpus, we developed a NLP system to detect medication and ADE information. Our NLP system is trained on the supervised machine learning model Conditional Random Fields. We compared our NLP system with the state-of-the-art NLP system the MetaMap for ADE detection.
Results:
NLP performed well on discharge summaries on certain named entities, including frequency (92\% F1), route (89\% F1), dosage (87\% F1), and medication (84\% F1). Because the number of ADE instances is small, NLP performed poorly on ADE (24\% F1). MetaMap performed on average 62\% F1 for medication and 4\% F1 for ADE.
Implications:
Our NLP system outperformed MetaMap for EHR notes ADE detection. NLP generally performs well with a sufficient size of annotated data. While the performance of ADE detection is low, more annotated data yielding a higher prevalence of ADEs would likely improve opioid ADE detection. Use of larger datasets is underway.
Impacts:
NLP has the potential to improve understanding of the nature and prevalence of opioid ADEs and, ultimately, advance the field of medication safety.},
booktitle = {The 2015 {HSR}\&{D}/{QUERI} {National} {Conference}},
author = {Yu, H. and Brandt, C. and Becker, W. and Kem, R.},
year = {2015}
}

Objectives: Prescription opioids are commonly used to treat acute and cancer-related pain, and, over the last two decades, have increasingly been used in the management of chronic non-cancer pain. Patients taking opioids can experience a wide range of adverse drug events (ADEs), including constipation, nausea/vomiting, pruritus, drowsiness and dizziness, hormonal dysfunction, depression, oversedation, falls, fractures, addiction, overdose, respiratory depression, sleep-disordered breathing, and death. Since such ADEs are frequently described in the unstructured electronic health record (EHR) notes, we are developing natural language processing (NLP) system to automatically extract opioid and ADEs from EHRs. The purpose of this study was to test out the feasibility of mining EHR notes for ADE detection using NLP approaches. Methods: We developed an annotation guideline using an interactive process during which physicians and linguists worked together to define rules and resolve discrepancy. Following the guideline, two annotators annotated 150 discharge summaries (or 8,672 sentences comprising 102,807 word tokens). The overall pairwise annotation agreement was 88%. The total number of annotated ADEs and medications were 103 and 3,290. Using this annotated corpus, we developed a NLP system to detect medication and ADE information. Our NLP system is trained on the supervised machine learning model Conditional Random Fields. We compared our NLP system with the state-of-the-art NLP system the MetaMap for ADE detection. Results: NLP performed well on discharge summaries on certain named entities, including frequency (92% F1), route (89% F1), dosage (87% F1), and medication (84% F1). Because the number of ADE instances is small, NLP performed poorly on ADE (24% F1). MetaMap performed on average 62% F1 for medication and 4% F1 for ADE. Implications: Our NLP system outperformed MetaMap for EHR notes ADE detection. 
NLP generally performs well with a sufficient size of annotated data. While the performance of ADE detection is low, more annotated data yielding a higher prevalence of ADEs would likely improve opioid ADE detection. Use of larger datasets is underway. Impacts: NLP has the potential to improve understanding of the nature and prevalence of opioid ADEs and, ultimately, advance the field of medication safety.

@inproceedings{yu_systems_2015,
title = {Systems for helping {Veterans} {Comprehend} their own {EHR} notes.},
url = {http://www.hsrd.research.va.gov/meetings/2015/abstract-display.cfm?RecordID=149},
abstract = {Objectives:
In January 2013, the VHA began to make clinical notes in the electronic health record (EHR) available through the My HealtheVet portal. However, studies have shown that patients are confused by EHR notes, especially patients in vulnerable groups (e.g., low literacy, low income). Such confusion may result in unintended increases in service utilization and lead to changes in perceptions that may disrupt patient-provider relationships. We are developing NoteAid, a multi-component, natural language processing (NLP) system that translates medical jargon into lay terms and provides definitions and links to related educational material from trusted resources. Here we report the development and evaluation of NoteAid.
Methods:
NoteAid has two main components: a knowledge resource comprised of patient education materials and the NLP system that links EHR notes to the knowledge resource. The knowledge resource integrates the medical jargon-lay term-definitions from the Unified Medical Language System and the MedlinePlus. The NLP system links the complex medical jargon that appear in the note to simple consumer oriented definitions and explanations from the knowledge resource. We evaluated NoteAid using 40 de-identified EHR notes (20 progress notes and 20 discharge summaries). We recruited 64 subjects. Each subject reads an assigned EHR note before and after NoteAid and self-scores comprehension (on a scale of 1 to 5, with 1 the poorest and 5 the best comprehension). Each subject completes the evaluation of either 20 PGNs or 20 DSs. We used Flesch-Kincaid grade level (FKGL) to score readability of each note.
Results:
Our results show a negative association between FKGL of a note and its subject's self-reported comprehension score (Spearman rho coefficient, -0.807; p {\textless} 0.0001). NoteAid improves self-reported EHR comprehension in both PGNs and DSs and the improvement is statistically significant (Wilcoxon signed-rank test, p {\textless} 0.05).
Implications:
NoteAid improves EHR note comprehension.
Impacts:
We are preparing to implement and evaluate NoteAid within the VA. NoteAid has the potential to facilitate translation of clinical records to patients, which could enhance patient engagement and lead to improved self-management and clinical outcomes.},
booktitle = {2015 {HSR}\&{D}/{QUERI} {National} {Conference}},
author = {Yu, H. and Brandt, C. and Houston, T.},
year = {2015}
}

Objectives: In January 2013, the VHA began to make clinical notes in the electronic health record (EHR) available through the My HealtheVet portal. However, studies have shown that patients are confused by EHR notes, especially patients in vulnerable groups (e.g., low literacy, low income). Such confusion may result in unintended increases in service utilization and lead to changes in perceptions that may disrupt patient-provider relationships. We are developing NoteAid, a multi-component, natural language processing (NLP) system that translates medical jargon into lay terms and provides definitions and links to related educational material from trusted resources. Here we report the development and evaluation of NoteAid. Methods: NoteAid has two main components: a knowledge resource comprised of patient education materials and the NLP system that links EHR notes to the knowledge resource. The knowledge resource integrates the medical jargon-lay term-definitions from the Unified Medical Language System and the MedlinePlus. The NLP system links the complex medical jargon that appear in the note to simple consumer oriented definitions and explanations from the knowledge resource. We evaluated NoteAid using 40 de-identified EHR notes (20 progress notes and 20 discharge summaries). We recruited 64 subjects. Each subject reads an assigned EHR note before and after NoteAid and self-scores comprehension (on a scale of 1 to 5, with 1 the poorest and 5 the best comprehension). Each subject completes the evaluation of either 20 PGNs or 20 DSs. We used Flesch-Kincaid grade level (FKGL) to score readability of each note. Results: Our results show a negative association between FKGL of a note and its subject's self-reported comprehension score (Spearman rho coefficient, -0.807; p < 0.0001). NoteAid improves self-reported EHR comprehension in both PGNs and DSs and the improvement is statistically significant (Wilcoxon signed-rank test, p < 0.05).
Implications: NoteAid improves EHR note comprehension. Impacts: We are preparing to implement and evaluate NoteAid within the VA. NoteAid has the potential to facilitate translation of clinical records to patients, which could enhance patient engagement and lead to improved self-management and clinical outcomes.

@article{liu_learning_2014,
title = {Learning to {Rank} {Figures} within a {Biomedical} {Article}},
volume = {9},
issn = {1932-6203},
url = {http://dx.plos.org/10.1371/journal.pone.0061567},
doi = {10.1371/journal.pone.0061567},
abstract = {Hundreds of millions of figures are available in biomedical literature, representing important biomedical experimental evidence. This ever-increasing sheer volume has made it difficult for scientists to effectively and accurately access figures of their interest, the process of which is crucial for validating research facts and for formulating or testing novel research hypotheses. Current figure search applications can't fully meet this challenge as the "bag of figures" assumption doesn't take into account the relationship among figures. In our previous study, hundreds of biomedical researchers have annotated articles in which they serve as corresponding authors. They ranked each figure in their paper based on a figure's importance at their discretion, referred to as "figure ranking". Using this collection of annotated data, we investigated computational approaches to automatically rank figures. We exploited and extended the state-of-the-art listwise learning-to-rank algorithms and developed a new supervised-learning model BioFigRank. The cross-validation results show that BioFigRank yielded the best performance compared with other state-of-the-art computational models, and the greedy feature selection can further boost the ranking performance significantly. Furthermore, we carry out the evaluation by comparing BioFigRank with three-level competitive domain-specific human experts: (1) First Author, (2) Non-Author-In-Domain-Expert who is not the author nor co-author of an article but who works in the same field of the corresponding author of the article, and (3) Non-Author-Out-Domain-Expert who is not the author nor co-author of an article and who may or may not work in the same field of the corresponding author of an article. Our results show that BioFigRank outperforms Non-Author-Out-Domain-Expert and performs as well as Non-Author-In-Domain-Expert. 
Although BioFigRank underperforms First Author, since most biomedical researchers are either in- or out-domain-experts for an article, we conclude that BioFigRank represents an artificial intelligence system that offers expert-level intelligence to help biomedical researchers to navigate increasingly proliferated big data efficiently.},
language = {en},
number = {3},
urldate = {2015-02-26},
journal = {PLoS ONE},
author = {Liu, Feifan and Yu, Hong},
editor = {Preis, Tobias},
month = mar,
year = {2014},
pmid = {24625719},
pmcid = {PMC3953065},
pages = {e61567}
}

Hundreds of millions of figures are available in biomedical literature, representing important biomedical experimental evidence. This ever-increasing sheer volume has made it difficult for scientists to effectively and accurately access figures of their interest, the process of which is crucial for validating research facts and for formulating or testing novel research hypotheses. Current figure search applications can't fully meet this challenge as the "bag of figures" assumption doesn't take into account the relationship among figures. In our previous study, hundreds of biomedical researchers have annotated articles in which they serve as corresponding authors. They ranked each figure in their paper based on a figure's importance at their discretion, referred to as "figure ranking". Using this collection of annotated data, we investigated computational approaches to automatically rank figures. We exploited and extended the state-of-the-art listwise learning-to-rank algorithms and developed a new supervised-learning model BioFigRank. The cross-validation results show that BioFigRank yielded the best performance compared with other state-of-the-art computational models, and the greedy feature selection can further boost the ranking performance significantly. Furthermore, we carry out the evaluation by comparing BioFigRank with three-level competitive domain-specific human experts: (1) First Author, (2) Non-Author-In-Domain-Expert who is not the author nor co-author of an article but who works in the same field of the corresponding author of the article, and (3) Non-Author-Out-Domain-Expert who is not the author nor co-author of an article and who may or may not work in the same field of the corresponding author of an article. Our results show that BioFigRank outperforms Non-Author-Out-Domain-Expert and performs as well as Non-Author-In-Domain-Expert. 
Although BioFigRank underperforms First Author, since most biomedical researchers are either in- or out-domain-experts for an article, we conclude that BioFigRank represents an artificial intelligence system that offers expert-level intelligence to help biomedical researchers to navigate increasingly proliferated big data efficiently.

@article{zhang_computational_2014,
title = {Computational {Approaches} for {Predicting} {Biomedical} {Research} {Collaborations}},
volume = {9},
issn = {1932-6203},
url = {http://dx.plos.org/10.1371/journal.pone.0111795},
doi = {10.1371/journal.pone.0111795},
abstract = {Biomedical research is increasingly collaborative, and successful collaborations often produce high impact work. Computational approaches can be developed for automatically predicting biomedical research collaborations. Previous works of collaboration prediction mainly explored the topological structures of research collaboration networks, leaving out rich semantic information from the publications themselves. In this paper, we propose supervised machine learning approaches to predict research collaborations in the biomedical field. We explored both the semantic features extracted from author research interest profile and the author network topological features. We found that the most informative semantic features for author collaborations are related to research interest, including similarity of out-citing citations, similarity of abstracts. Of the four supervised machine learning models (naïve Bayes, naïve Bayes multinomial, SVMs, and logistic regression), the best performing model is logistic regression with an ROC ranging from 0.766 to 0.980 on different datasets. To our knowledge we are the first to study in depth how research interest and productivities can be used for collaboration prediction. Our approach is computationally efficient, scalable and yet simple to implement. The datasets of this study are available at https://github.com/qingzhanggithub/medline-collaboration-datasets.},
language = {en},
number = {11},
urldate = {2015-02-26},
journal = {PLoS ONE},
author = {Zhang, Qing and Yu, Hong},
editor = {Smalheiser, Neil R.},
month = nov,
year = {2014},
pmid = {25375164},
pmcid = {PMC4222920},
pages = {e111795}
}

Biomedical research is increasingly collaborative, and successful collaborations often produce high impact work. Computational approaches can be developed for automatically predicting biomedical research collaborations. Previous works of collaboration prediction mainly explored the topological structures of research collaboration networks, leaving out rich semantic information from the publications themselves. In this paper, we propose supervised machine learning approaches to predict research collaborations in the biomedical field. We explored both the semantic features extracted from author research interest profile and the author network topological features. We found that the most informative semantic features for author collaborations are related to research interest, including similarity of out-citing citations, similarity of abstracts. Of the four supervised machine learning models (naïve Bayes, naïve Bayes multinomial, SVMs, and logistic regression), the best performing model is logistic regression with an ROC ranging from 0.766 to 0.980 on different datasets. To our knowledge we are the first to study in depth how research interest and productivities can be used for collaboration prediction. Our approach is computationally efficient, scalable and yet simple to implement. The datasets of this study are available at https://github.com/qingzhanggithub/medline-collaboration-datasets.

@article{li_robust_2014,
title = {A robust data-driven approach for gene ontology annotation},
volume = {2014},
issn = {1758-0463},
url = {http://database.oxfordjournals.org/cgi/doi/10.1093/database/bau113},
doi = {10.1093/database/bau113},
abstract = {Gene ontology (GO) and GO annotation are important resources for biological information management and knowledge discovery, but the speed of manual annotation became a major bottleneck of database curation. BioCreative IV GO annotation task aims to evaluate the performance of system that automatically assigns GO terms to genes based on the narrative sentences in biomedical literature. This article presents our work in this task as well as the experimental results after the competition. For the evidence sentence extraction subtask, we built a binary classifier to identify evidence sentences using reference distance estimator (RDE), a recently proposed semi-supervised learning method that learns new features from around 10 million unlabeled sentences, achieving an F1 of 19.3\% in exact match and 32.5\% in relaxed match. In the post-submission experiment, we obtained 22.1\% and 35.7\% F1 performance by incorporating bigram features in RDE learning. In both development and test sets, RDE-based method achieved over 20\% relative improvement on F1 and AUC performance against classical supervised learning methods, e.g. support vector machine and logistic regression. For the GO term prediction subtask, we developed an information retrieval-based method to retrieve the GO term most relevant to each evidence sentence using a ranking function that combined cosine similarity and the frequency of GO terms in documents, and a filtering method based on high-level GO classes. The best performance of our submitted runs was 7.8\% F1 and 22.2\% hierarchy F1. We found that the incorporation of frequency information and hierarchy filtering substantially improved the performance. In the post-submission evaluation, we obtained a 10.6\% F1 using a simpler setting. Overall, the experimental analysis showed our approaches were robust in both the two tasks.},
language = {eng},
journal = {Database: The Journal of Biological Databases and Curation},
author = {Li, Yanpeng and Yu, Hong},
year = {2014},
pmid = {25425037},
pmcid = {PMC4243380},
pages = {bau113}
}

Gene ontology (GO) and GO annotation are important resources for biological information management and knowledge discovery, but the speed of manual annotation became a major bottleneck of database curation. BioCreative IV GO annotation task aims to evaluate the performance of system that automatically assigns GO terms to genes based on the narrative sentences in biomedical literature. This article presents our work in this task as well as the experimental results after the competition. For the evidence sentence extraction subtask, we built a binary classifier to identify evidence sentences using reference distance estimator (RDE), a recently proposed semi-supervised learning method that learns new features from around 10 million unlabeled sentences, achieving an F1 of 19.3% in exact match and 32.5% in relaxed match. In the post-submission experiment, we obtained 22.1% and 35.7% F1 performance by incorporating bigram features in RDE learning. In both development and test sets, RDE-based method achieved over 20% relative improvement on F1 and AUC performance against classical supervised learning methods, e.g. support vector machine and logistic regression. For the GO term prediction subtask, we developed an information retrieval-based method to retrieve the GO term most relevant to each evidence sentence using a ranking function that combined cosine similarity and the frequency of GO terms in documents, and a filtering method based on high-level GO classes. The best performance of our submitted runs was 7.8% F1 and 22.2% hierarchy F1. We found that the incorporation of frequency information and hierarchy filtering substantially improved the performance. In the post-submission evaluation, we obtained a 10.6% F1 using a simpler setting. Overall, the experimental analysis showed our approaches were robust in both the two tasks.

Allowing patients access to their physicians’ notes has the potential to enhance their understanding of disease and improve medication adherence and healthcare outcomes. However, a recent study involving over ten thousand patients showed that allowing patients to read their electronic health record (EHR) notes caused confusion, especially for the vulnerable (e.g., lower literacy, lower income) groups. This finding is not surprising as EHR notes contain medical jargon that may be difficult for patients to comprehend. To improve patients’ EHR note comprehension, we are developing a biomedical natural language processing system called NoteAid (http://clinicalnotesaid.org), which translates medical jargon into consumer-oriented lay language. The current NoteAid implementations link EHR medical terms to their definitions and other related educational material. Our evaluation has shown that all NoteAid implementations improve self-rated EHR note comprehension by 23% to 40% of lay people.

This paper details the development and implementation of CiteGraph, a system for constructing large-scale citation and co-authorship networks from full-text biomedical articles. CiteGraph represents articles and authors by uniquely identified nodes, and connects those nodes through citation and co-authorship relations. CiteGraph network encompasses over 1.65 million full-text articles and 6.35 million citations by 1.37 million unique authors from the Elsevier full-text articles. Our evaluation shows 98%–99% F1-score for mapping a citation to the corresponding article and identifying MEDLINE articles. We further analyzed the characteristics of CiteGraph and found that they are consistent with assumptions made using small-scale bibliometric analysis. We also developed several novel network-based methods for analyzing publication, citation and collaboration patterns. This is the first work to develop a completely automated system for the creation of a large-scale citation network in the biomedical domain, and also to introduce novel findings in researcher publication histories. CiteGraph can be a useful resource to both the biomedical community, and bibliometric research.

In mobile health (M-health), Short Message Service (SMS) has shown to improve disease related self-management and health service outcomes, leading to enhanced patient care. However, the hard limit on character size for each message limits the full value of exploring SMS communication in health care practices. To overcome this problem and improve the efficiency of clinical workflow, we developed an innovative system, MedTxting (available at http://medtxting.askhermes.org), which is a learning-based but knowledge-rich system that compresses medical texts in a SMS style. Evaluations on clinical questions and discharge summary narratives show that MedTxting can effectively compress medical texts with reasonable readability and noticeable size reduction. Findings in this work reveal potentials of MedTxting to the clinical settings, allowing for real-time and cost-effective communication, such as patient condition reporting, medication consulting, physicians connecting to share expertise to improve point of care.

@article{cao_askhermes_2011,
title = {{AskHERMES}: {An} online question answering system for complex clinical questions},
volume = {44},
issn = {1532-0480},
shorttitle = {{AskHERMES}},
url = {http://www.ncbi.nlm.nih.gov/pubmed/21256977},
doi = {10.1016/j.jbi.2011.01.004},
abstract = {{\textless}AbstractText Label="OBJECTIVE" NlmCategory="OBJECTIVE"{\textgreater}Clinical questions are often long and complex and take many forms. We have built a clinical question answering system named AskHERMES to perform robust semantic analysis on complex clinical questions and output question-focused extractive summaries as answers.{\textless}/AbstractText{\textgreater}
{\textless}AbstractText Label="DESIGN" NlmCategory="METHODS"{\textgreater}This paper describes the system architecture and a preliminary evaluation of AskHERMES, which implements innovative approaches in question analysis, summarization, and answer presentation. Five types of resources were indexed in this system: MEDLINE abstracts, PubMed Central full-text articles, eMedicine documents, clinical guidelines and Wikipedia articles.{\textless}/AbstractText{\textgreater}
{\textless}AbstractText Label="MEASUREMENT" NlmCategory="METHODS"{\textgreater}We compared the AskHERMES system with Google (Google and Google Scholar) and UpToDate and asked physicians to score the three systems by ease of use, quality of answer, time spent, and overall performance.{\textless}/AbstractText{\textgreater}
{\textless}AbstractText Label="RESULTS" NlmCategory="RESULTS"{\textgreater}AskHERMES allows physicians to enter a question in a natural way with minimal query formulation and allows physicians to efficiently navigate among all the answer sentences to quickly meet their information needs. In contrast, physicians need to formulate queries to search for information in Google and UpToDate. The development of the AskHERMES system is still at an early stage, and the knowledge resource is limited compared with Google or UpToDate. Nevertheless, the evaluation results show that AskHERMES' performance is comparable to the other systems. In particular, when answering complex clinical questions, it demonstrates the potential to outperform both Google and UpToDate systems.{\textless}/AbstractText{\textgreater}
{\textless}AbstractText Label="CONCLUSIONS" NlmCategory="CONCLUSIONS"{\textgreater}AskHERMES, available at http://www.AskHERMES.org, has the potential to help physicians practice evidence-based medicine and improve the quality of patient care.{\textless}/AbstractText{\textgreater}},
number = {2},
urldate = {2011-03-25},
journal = {Journal of Biomedical Informatics},
author = {Cao, Yonggang and Liu, Feifan and Simpson, Pippa and Antieau, Lamont and Bennett, Andrew and Cimino, James J and Ely, John and Yu, Hong},
month = apr,
year = {2011},
pmid = {21256977},
pmcid = {PMC3433744},
keywords = {Algorithms, Clinical Medicine, Databases, Factual, Information Storage and Retrieval, Online Systems, Software, expert systems, natural language processing},
pages = {277--288}
}

OBJECTIVE: Clinical questions are often long and complex and take many forms. We have built a clinical question answering system named AskHERMES to perform robust semantic analysis on complex clinical questions and output question-focused extractive summaries as answers. DESIGN: This paper describes the system architecture and a preliminary evaluation of AskHERMES, which implements innovative approaches in question analysis, summarization, and answer presentation. Five types of resources were indexed in this system: MEDLINE abstracts, PubMed Central full-text articles, eMedicine documents, clinical guidelines and Wikipedia articles. MEASUREMENT: We compared the AskHERMES system with Google (Google and Google Scholar) and UpToDate and asked physicians to score the three systems by ease of use, quality of answer, time spent, and overall performance. RESULTS: AskHERMES allows physicians to enter a question in a natural way with minimal query formulation and allows physicians to efficiently navigate among all the answer sentences to quickly meet their information needs. In contrast, physicians need to formulate queries to search for information in Google and UpToDate. The development of the AskHERMES system is still at an early stage, and the knowledge resource is limited compared with Google or UpToDate. Nevertheless, the evaluation results show that AskHERMES' performance is comparable to the other systems. 
In particular, when answering complex clinical questions, it demonstrates the potential to outperform both Google and UpToDate systems. CONCLUSIONS: AskHERMES, available at http://www.AskHERMES.org, has the potential to help physicians practice evidence-based medicine and improve the quality of patient care.

@article{liu_toward_2011,
title = {Toward automated consumer question answering: {Automatically} separating consumer questions from professional questions in the healthcare domain},
volume = {44},
issn = {1532-0464},
shorttitle = {Toward automated consumer question answering},
url = {http://linkinghub.elsevier.com/retrieve/pii/S1532046411001353},
doi = {10.1016/j.jbi.2011.08.008},
abstract = {OBJECTIVE:
Both healthcare professionals and healthcare consumers have information needs that can be met through the use of computers, specifically via medical question answering systems. However, the information needs of both groups are different in terms of literacy levels and technical expertise, and an effective question answering system must be able to account for these differences if it is to formulate the most relevant responses for users from each group. In this paper, we propose that a first step toward answering the queries of different users is automatically classifying questions according to whether they were asked by healthcare professionals or consumers.
DESIGN:
We obtained two sets of consumer questions ({\textasciitilde}10,000 questions in total) from Yahoo answers. The professional questions consist of two question collections: 4654 point-of-care questions (denoted as PointCare) obtained from interviews of a group of family doctors following patient visits and 5378 questions from physician practices through professional online services (denoted as OnlinePractice). With more than 20,000 questions combined, we developed supervised machine-learning models for automatic classification between consumer questions and professional questions. To evaluate the robustness of our models, we tested the model that was trained on the Consumer-PointCare dataset on the Consumer-OnlinePractice dataset. We evaluated both linguistic features and statistical features and examined how the characteristics in two different types of professional questions (PointCare vs. OnlinePractice) may affect the classification performance. We explored information gain for feature reduction and the back-off linguistic category features.
RESULTS:
The 10-fold cross-validation results showed the best F1-measure of 0.936 and 0.946 on Consumer-PointCare and Consumer-OnlinePractice respectively, and the best F1-measure of 0.891 when testing the Consumer-PointCare model on the Consumer-OnlinePractice dataset.
CONCLUSION:
Healthcare consumer questions posted at Yahoo online communities can be reliably classified from professional questions posted by point-of-care clinicians and online physicians. The supervised machine-learning models are robust for this task. Our study will significantly benefit further development in automated consumer question answering.},
language = {en},
number = {6},
urldate = {2016-11-30},
journal = {Journal of Biomedical Informatics},
author = {Liu, Feifan and Antieau, Lamont D. and Yu, Hong},
month = dec,
year = {2011},
pmid = {21856442},
pmcid = {PMC3226885},
keywords = {Artificial Intelligence, Consumer Participation, Databases, Factual, Delivery of Health Care, Humans, Information Dissemination, Information Storage and Retrieval, Internet, Point-of-Care Systems, Semantics, natural language processing},
pages = {1032--1038}
}

OBJECTIVE: Both healthcare professionals and healthcare consumers have information needs that can be met through the use of computers, specifically via medical question answering systems. However, the information needs of both groups are different in terms of literacy levels and technical expertise, and an effective question answering system must be able to account for these differences if it is to formulate the most relevant responses for users from each group. In this paper, we propose that a first step toward answering the queries of different users is automatically classifying questions according to whether they were asked by healthcare professionals or consumers. DESIGN: We obtained two sets of consumer questions (~10,000 questions in total) from Yahoo answers. The professional questions consist of two question collections: 4654 point-of-care questions (denoted as PointCare) obtained from interviews of a group of family doctors following patient visits and 5378 questions from physician practices through professional online services (denoted as OnlinePractice). With more than 20,000 questions combined, we developed supervised machine-learning models for automatic classification between consumer questions and professional questions. To evaluate the robustness of our models, we tested the model that was trained on the Consumer-PointCare dataset on the Consumer-OnlinePractice dataset. We evaluated both linguistic features and statistical features and examined how the characteristics in two different types of professional questions (PointCare vs. OnlinePractice) may affect the classification performance. We explored information gain for feature reduction and the back-off linguistic category features. 
RESULTS: The 10-fold cross-validation results showed the best F1-measure of 0.936 and 0.946 on Consumer-PointCare and Consumer-OnlinePractice respectively, and the best F1-measure of 0.891 when testing the Consumer-PointCare model on the Consumer-OnlinePractice dataset. CONCLUSION: Healthcare consumer questions posted at Yahoo online communities can be reliably classified from professional questions posted by point-of-care clinicians and online physicians. The supervised machine-learning models are robust for this task. Our study will significantly benefit further development in automated consumer question answering.

@article{agarwal_simple_2011,
title = {Simple and efficient machine learning frameworks for identifying protein-protein interaction relevant articles and experimental methods used to study the interactions},
volume = {12},
issn = {1471-2105},
url = {http://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-12-S8-S10},
doi = {10.1186/1471-2105-12-S8-S10},
abstract = {BACKGROUND:
Protein-protein interaction (PPI) is an important biomedical phenomenon. Automatically detecting PPI-relevant articles and identifying methods that are used to study PPI are important text mining tasks. In this study, we have explored domain independent features to develop two open source machine learning frameworks. One performs binary classification to determine whether the given article is PPI relevant or not, named "Simple Classifier", and the other one maps the PPI relevant articles with corresponding interaction method nodes in a standardized PSI-MI (Proteomics Standards Initiative-Molecular Interactions) ontology, named "OntoNorm".
RESULTS:
We evaluated our system in the context of BioCreative challenge competition using the standardized data set. Our systems are amongst the top systems reported by the organizers, attaining 60.8\% F1-score for identifying relevant documents, and 52.3\% F1-score for mapping articles to interaction method ontology.
CONCLUSION:
Our results show that domain-independent machine learning frameworks can perform competitively well at the tasks of detecting PPI relevant articles and identifying the methods that were used to study the interaction in such articles.},
language = {en},
number = {Suppl 8},
urldate = {2016-11-30},
journal = {BMC Bioinformatics},
author = {Agarwal, Shashank and Liu, Feifan and Yu, Hong},
year = {2011},
pmid = {22151701},
pmcid = {PMC3269933},
pages = {S10}
}

BACKGROUND: Protein-protein interaction (PPI) is an important biomedical phenomenon. Automatically detecting PPI-relevant articles and identifying methods that are used to study PPI are important text mining tasks. In this study, we have explored domain independent features to develop two open source machine learning frameworks. One performs binary classification to determine whether the given article is PPI relevant or not, named "Simple Classifier", and the other one maps the PPI relevant articles with corresponding interaction method nodes in a standardized PSI-MI (Proteomics Standards Initiative-Molecular Interactions) ontology, named "OntoNorm". RESULTS: We evaluated our system in the context of BioCreative challenge competition using the standardized data set. Our systems are amongst the top systems reported by the organizers, attaining 60.8% F1-score for identifying relevant documents, and 52.3% F1-score for mapping articles to interaction method ontology. CONCLUSION: Our results show that domain-independent machine learning frameworks can perform competitively well at the tasks of detecting PPI relevant articles and identifying the methods that were used to study the interaction in such articles.

@article{zhang_parsing_2011,
title = {Parsing citations in biomedical articles using conditional random fields},
volume = {41},
issn = {0010-4825},
url = {http://linkinghub.elsevier.com/retrieve/pii/S0010482511000291},
doi = {10.1016/j.compbiomed.2011.02.005},
abstract = {Citations are used ubiquitously in biomedical full-text articles and play an important role for representing both the rhetorical structure and the semantic content of the articles. As a result, text mining systems will significantly benefit from a tool that automatically extracts the content of a citation. In this study, we applied the supervised machine-learning algorithms Conditional Random Fields (CRFs) to automatically parse a citation into its fields (e.g., Author, Title, Journal, and Year). With a subset of html format open-access PubMed Central articles, we report an overall 97.95\% F1-score. The citation parser can be accessed at: http://www.cs.uwm.edu/∼qing/projects/cithit/index.html.},
language = {en},
number = {4},
urldate = {2016-11-30},
journal = {Computers in Biology and Medicine},
author = {Zhang, Qing and Cao, Yong-Gang and Yu, Hong},
month = apr,
year = {2011},
pmid = {21419403},
pmcid = {PMC3086470},
pages = {190--194}
}

Citations are used ubiquitously in biomedical full-text articles and play an important role for representing both the rhetorical structure and the semantic content of the articles. As a result, text mining systems will significantly benefit from a tool that automatically extracts the content of a citation. In this study, we applied the supervised machine-learning algorithms Conditional Random Fields (CRFs) to automatically parse a citation into its fields (e.g., Author, Title, Journal, and Year). With a subset of html format open-access PubMed Central articles, we report an overall 97.95% F1-score. The citation parser can be accessed at: http://www.cs.uwm.edu/∼qing/projects/cithit/index.html.

@article{miller_investigation_2011,
title = {An investigation into the feasibility of spoken clinical question answering},
volume = {2011},
issn = {1942-597X},
url = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3243288/},
abstract = {Spoken question answering for clinical decision support is a potentially revolutionary technology for improving the efficiency and quality of health care delivery. This application involves many technologies currently being researched, including automatic speech recognition (ASR), information retrieval (IR), and summarization, all in the biomedical domain. In certain domains, the problem of spoken document retrieval has been declared solved because of the robustness of IR to ASR errors. This study investigates the extent to which spoken medical question answering benefits from that same robustness. We used the best results from previous speech recognition experiments as inputs to a clinical question answering system, and had physicians perform blind evaluations of results generated both by ASR transcripts of questions and gold standard transcripts of the same questions. Our results suggest that the medical domain differs enough from the open domain to require additional work in automatic speech recognition adapted for the biomedical domain.},
language = {eng},
journal = {AMIA Annual Symposium Proceedings},
author = {Miller, Tim and Ravvaz, Kourosh and Cimino, James J. and Yu, Hong},
year = {2011},
pmid = {22195154},
pmcid = {PMC3243288},
keywords = {Decision Support Systems, Clinical, Feasibility Studies, Humans, Information Storage and Retrieval, Speech Recognition Software, natural language processing},
pages = {954--959}
}

Spoken question answering for clinical decision support is a potentially revolutionary technology for improving the efficiency and quality of health care delivery. This application involves many technologies currently being researched, including automatic speech recognition (ASR), information retrieval (IR), and summarization, all in the biomedical domain. In certain domains, the problem of spoken document retrieval has been declared solved because of the robustness of IR to ASR errors. This study investigates the extent to which spoken medical question answering benefits from that same robustness. We used the best results from previous speech recognition experiments as inputs to a clinical question answering system, and had physicians perform blind evaluations of results generated both by ASR transcripts of questions and gold standard transcripts of the same questions. Our results suggest that the medical domain differs enough from the open domain to require additional work in automatic speech recognition adapted for the biomedical domain.

@article{granger_apixaban_2011,
author = {Granger, Christopher B. and Alexander, John H. and McMurray, John J. V. and Lopes, Renato D. and Hylek, Elaine M. and Hanna, Michael and Al-Khalidi, Hussein R. and Ansell, Jack and Atar, Dan and Avezum, Alvaro and Bahit, M. Cecilia and Diaz, Rafael and Easton, J. Donald and Ezekowitz, Justin A. and Flaker, Greg and Garcia, David and Geraldes, Margarida and Gersh, Bernard J. and Golitsyn, Sergey and Goto, Shinya and Hermosillo, Antonio G. and Hohnloser, Stefan H. and Horowitz, John and Mohan, Puneet and Jansky, Petr and Lewis, Basil S. and Lopez-Sendon, Jose Luis and Pais, Prem and Parkhomenko, Alexander and Verheugt, Freek W. A. and Zhu, Jun and Wallentin, Lars and {ARISTOTLE Committees and Investigators}},
title = {Apixaban versus warfarin in patients with atrial fibrillation},
journal = {The New England Journal of Medicine},
volume = {365},
number = {11},
pages = {981--992},
month = sep,
year = {2011},
issn = {1533-4406},
doi = {10.1056/NEJMoa1107039},
url = {http://www.nejm.org/doi/full/10.1056/NEJMoa1107039},
pmid = {21870978},
language = {eng},
keywords = {Aged, Anticoagulants, Atrial Fibrillation, Double-Blind Method, Factor Xa Inhibitors, Female, Follow-Up Studies, Hemorrhage, Humans, International Normalized Ratio, Kaplan-Meier Estimate, Male, Middle Aged, Pyrazoles, Pyridones, Stroke, Thromboembolism, Treatment Outcome, Warfarin},
abstract = {BACKGROUND: Vitamin K antagonists are highly effective in preventing stroke in patients with atrial fibrillation but have several limitations. Apixaban is a novel oral direct factor Xa inhibitor that has been shown to reduce the risk of stroke in a similar population in comparison with aspirin.
METHODS: In this randomized, double-blind trial, we compared apixaban (at a dose of 5 mg twice daily) with warfarin (target international normalized ratio, 2.0 to 3.0) in 18,201 patients with atrial fibrillation and at least one additional risk factor for stroke. The primary outcome was ischemic or hemorrhagic stroke or systemic embolism. The trial was designed to test for noninferiority, with key secondary objectives of testing for superiority with respect to the primary outcome and to the rates of major bleeding and death from any cause.
RESULTS: The median duration of follow-up was 1.8 years. The rate of the primary outcome was 1.27\% per year in the apixaban group, as compared with 1.60\% per year in the warfarin group (hazard ratio with apixaban, 0.79; 95\% confidence interval [CI], 0.66 to 0.95; P{\textless}0.001 for noninferiority; P=0.01 for superiority). The rate of major bleeding was 2.13\% per year in the apixaban group, as compared with 3.09\% per year in the warfarin group (hazard ratio, 0.69; 95\% CI, 0.60 to 0.80; P{\textless}0.001), and the rates of death from any cause were 3.52\% and 3.94\%, respectively (hazard ratio, 0.89; 95\% CI, 0.80 to 0.99; P=0.047). The rate of hemorrhagic stroke was 0.24\% per year in the apixaban group, as compared with 0.47\% per year in the warfarin group (hazard ratio, 0.51; 95\% CI, 0.35 to 0.75; P{\textless}0.001), and the rate of ischemic or uncertain type of stroke was 0.97\% per year in the apixaban group and 1.05\% per year in the warfarin group (hazard ratio, 0.92; 95\% CI, 0.74 to 1.13; P=0.42).
CONCLUSIONS: In patients with atrial fibrillation, apixaban was superior to warfarin in preventing stroke or systemic embolism, caused less bleeding, and resulted in lower mortality. (Funded by Bristol-Myers Squibb and Pfizer; ARISTOTLE ClinicalTrials.gov number, NCT00412984.).}
}

BACKGROUND: Vitamin K antagonists are highly effective in preventing stroke in patients with atrial fibrillation but have several limitations. Apixaban is a novel oral direct factor Xa inhibitor that has been shown to reduce the risk of stroke in a similar population in comparison with aspirin. METHODS: In this randomized, double-blind trial, we compared apixaban (at a dose of 5 mg twice daily) with warfarin (target international normalized ratio, 2.0 to 3.0) in 18,201 patients with atrial fibrillation and at least one additional risk factor for stroke. The primary outcome was ischemic or hemorrhagic stroke or systemic embolism. The trial was designed to test for noninferiority, with key secondary objectives of testing for superiority with respect to the primary outcome and to the rates of major bleeding and death from any cause. RESULTS: The median duration of follow-up was 1.8 years. The rate of the primary outcome was 1.27% per year in the apixaban group, as compared with 1.60% per year in the warfarin group (hazard ratio with apixaban, 0.79; 95% confidence interval [CI], 0.66 to 0.95; P<0.001 for noninferiority; P=0.01 for superiority). The rate of major bleeding was 2.13% per year in the apixaban group, as compared with 3.09% per year in the warfarin group (hazard ratio, 0.69; 95% CI, 0.60 to 0.80; P<0.001), and the rates of death from any cause were 3.52% and 3.94%, respectively (hazard ratio, 0.89; 95% CI, 0.80 to 0.99; P=0.047). The rate of hemorrhagic stroke was 0.24% per year in the apixaban group, as compared with 0.47% per year in the warfarin group (hazard ratio, 0.51; 95% CI, 0.35 to 0.75; P<0.001), and the rate of ischemic or uncertain type of stroke was 0.97% per year in the apixaban group and 1.05% per year in the warfarin group (hazard ratio, 0.92; 95% CI, 0.74 to 1.13; P=0.42). 
CONCLUSIONS: In patients with atrial fibrillation, apixaban was superior to warfarin in preventing stroke or systemic embolism, caused less bleeding, and resulted in lower mortality. (Funded by Bristol-Myers Squibb and Pfizer; ARISTOTLE ClinicalTrials.gov number, NCT00412984.).

Identification of discourse relations, such as causal and contrastive relations, between situations mentioned in text is an important task for biomedical text-mining. A biomedical text corpus annotated with discourse relations would be very useful for developing and evaluating methods for biomedical discourse processing. However, little effort has been made to develop such an annotated resource.

@article{liu_towards_2011,
author = {Liu, Feifan and Tur, Gokhan and Hakkani-Tür, Dilek and Yu, Hong},
title = {Towards spoken clinical-question answering: evaluating and adapting automatic speech-recognition systems for spoken clinical questions},
shorttitle = {Towards spoken clinical-question answering},
journal = {Journal of the American Medical Informatics Association: JAMIA},
volume = {18},
number = {5},
pages = {625--630},
month = oct,
year = {2011},
issn = {1527-974X},
doi = {10.1136/amiajnl-2010-000071},
url = {http://www.ncbi.nlm.nih.gov/pubmed/21705457},
urldate = {2011-12-13},
pmid = {21705457},
abstract = {OBJECTIVE
To evaluate existing automatic speech-recognition (ASR) systems to measure their performance in interpreting spoken clinical questions and to adapt one ASR system to improve its performance on this task.
DESIGN AND MEASUREMENTS
The authors evaluated two well-known ASR systems on spoken clinical questions: Nuance Dragon (both generic and medical versions: Nuance Gen and Nuance Med) and the SRI Decipher (the generic version SRI Gen). The authors also explored language model adaptation using more than 4000 clinical questions to improve the SRI system's performance, and profile training to improve the performance of the Nuance Med system. The authors reported the results with the NIST standard word error rate (WER) and further analyzed error patterns at the semantic level.
RESULTS
Nuance Gen and Med systems resulted in a WER of 68.1\% and 67.4\% respectively. The SRI Gen system performed better, attaining a WER of 41.5\%. After domain adaptation with a language model, the performance of the SRI system improved 36\% to a final WER of 26.7\%.
CONCLUSION
Without modification, two well-known ASR systems do not perform well in interpreting spoken clinical questions. With a simple domain adaptation, one of the ASR systems improved significantly on the clinical question task, indicating the importance of developing domain/genre-specific ASR systems.}
}

OBJECTIVE To evaluate existing automatic speech-recognition (ASR) systems to measure their performance in interpreting spoken clinical questions and to adapt one ASR system to improve its performance on this task. DESIGN AND MEASUREMENTS The authors evaluated two well-known ASR systems on spoken clinical questions: Nuance Dragon (both generic and medical versions: Nuance Gen and Nuance Med) and the SRI Decipher (the generic version SRI Gen). The authors also explored language model adaptation using more than 4000 clinical questions to improve the SRI system's performance, and profile training to improve the performance of the Nuance Med system. The authors reported the results with the NIST standard word error rate (WER) and further analyzed error patterns at the semantic level. RESULTS Nuance Gen and Med systems resulted in a WER of 68.1% and 67.4% respectively. The SRI Gen system performed better, attaining a WER of 41.5%. After domain adaptation with a language model, the performance of the SRI system improved 36% to a final WER of 26.7%. CONCLUSION Without modification, two well-known ASR systems do not perform well in interpreting spoken clinical questions. With a simple domain adaptation, one of the ASR systems improved significantly on the clinical question task, indicating the importance of developing domain/genre-specific ASR systems.

@article{li_lancet_2010,
title = {Lancet: a high precision medication event extraction system for clinical text},
volume = {17},
issn = {1527-974X},
shorttitle = {Lancet},
url = {http://www.ncbi.nlm.nih.gov/pubmed/20819865},
doi = {10.1136/jamia.2010.004077},
abstract = {OBJECTIVE: This paper presents Lancet, a supervised machine-learning system that automatically extracts medication events consisting of medication names and information pertaining to their prescribed use (dosage, mode, frequency, duration and reason) from lists or narrative text in medical discharge summaries. DESIGN: Lancet incorporates three supervised machine-learning models: a conditional random fields model for tagging individual medication names and associated fields, an AdaBoost model with decision stump algorithm for determining which medication names and fields belong to a single medication event, and a support vector machines disambiguation model for identifying the context style (narrative or list). MEASUREMENTS: The authors, from the University of Wisconsin-Milwaukee, participated in the third i2b2 shared-task for challenges in natural language processing for clinical data: medication extraction challenge. With the performance metrics provided by the i2b2 challenge, the micro F1 (precision/recall) scores are reported for both the horizontal and vertical level. RESULTS: Among the top 10 teams, Lancet achieved the highest precision at 90.4\% with an overall F1 score of 76.4\% (horizontal system level with exact match), a gain of 11.2\% and 12\%, respectively, compared with the rule-based baseline system jMerki. By combining the two systems, the hybrid system further increased the F1 score by 3.4\% from 76.4\% to 79.0\%. CONCLUSIONS: Supervised machine-learning systems with minimal external knowledge resources can achieve a high precision with a competitive overall F1 score.Lancet based on this learning framework does not rely on expensive manually curated rules. The system is available online at http://code.google.com/p/lancet/.},
number = {5},
urldate = {2010-09-21},
journal = {Journal of the American Medical Informatics Association: JAMIA},
author = {Li, Zuofeng and Liu, Feifan and Antieau, Lamont and Cao, Yonggang and Yu, Hong},
month = oct,
year = {2010},
pmid = {20819865},
pmcid = {PMC2995682},
pages = {563--567}
}

OBJECTIVE: This paper presents Lancet, a supervised machine-learning system that automatically extracts medication events consisting of medication names and information pertaining to their prescribed use (dosage, mode, frequency, duration and reason) from lists or narrative text in medical discharge summaries. DESIGN: Lancet incorporates three supervised machine-learning models: a conditional random fields model for tagging individual medication names and associated fields, an AdaBoost model with decision stump algorithm for determining which medication names and fields belong to a single medication event, and a support vector machines disambiguation model for identifying the context style (narrative or list). MEASUREMENTS: The authors, from the University of Wisconsin-Milwaukee, participated in the third i2b2 shared-task for challenges in natural language processing for clinical data: medication extraction challenge. With the performance metrics provided by the i2b2 challenge, the micro F1 (precision/recall) scores are reported for both the horizontal and vertical level. RESULTS: Among the top 10 teams, Lancet achieved the highest precision at 90.4% with an overall F1 score of 76.4% (horizontal system level with exact match), a gain of 11.2% and 12%, respectively, compared with the rule-based baseline system jMerki. By combining the two systems, the hybrid system further increased the F1 score by 3.4% from 76.4% to 79.0%. CONCLUSIONS: Supervised machine-learning systems with minimal external knowledge resources can achieve a high precision with a competitive overall F1 score.Lancet based on this learning framework does not rely on expensive manually curated rules. The system is available online at http://code.google.com/p/lancet/.

@article{ramesh_identifying_2010,
title = {Identifying discourse connectives in biomedical text},
volume = {2010},
issn = {1942-597X},
url = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3041460/},
abstract = {Discourse connectives are words or phrases that connect or relate two coherent sentences or phrases and indicate the presence of discourse relations. Automatic recognition of discourse connectives may benefit many natural language processing applications. In this pilot study, we report the development of the supervised machine-learning classifiers with conditional random fields (CRFs) for automatically identifying discourse connectives in full-text biomedical articles. Our first classifier was trained on the open-domain 1 million token Penn Discourse Tree Bank (PDTB). We performed cross validation on biomedical articles (approximately 100K word tokens) that we annotated. The results show that the classifier trained on PDTB data attained a 0.55 F1-score for identifying discourse connectives in biomedical text, while the cross-validation results in the biomedical text attained a 0.69 F1-score, a much better performance despite a much smalle