@article{Volkova_2010_WI,
  author        = {Svitlana Volkova and Doina Caragea and William Hsu and John Drouhard and Landon Fowles},
  title         = {Boosting Biomedical Entity Extraction by Using Syntactic Patterns for Semantic Relation Discovery},
  year          = {2010},
  date          = {2010-01-01},
  pages         = {272--278},
  publisher     = {IEEE Computer Society},
  address       = {Los Alamitos, CA, USA},
  abstract      = {Biomedical entity extraction from unstructured web documents is an important task that needs to be performed in order to discover knowledge in the veterinary medicine domain. In general, this task can be approached by applying domain-specific ontologies, but a review of the literature shows that there is no universal dictionary, or ontology for this domain. To address this issue, we manually construct an ontology for extracting entities such as: animal disease names, viruses and serotypes. We then use an automated ontology expansion approach to extract semantic relationships between concepts. Such relationships include asserted synonymy, hyponymy and causality. Specifically, these relationships are extracted by using a set of syntactic patterns and part-of-speech tagging. The resulting ontology contains richer semantics compared to the manually-constructed ontology. We compare our approach for extracting synonyms, hyponyms and other disease related concepts, with an approach where the ontology is expanded using GoogleSets, on the veterinary medicine entity extraction task. Experimental results show that our semantic relationship extraction approach produces a significant increase in precision and recall as compared to the GoogleSets approach.},
  keywords      = {},
  pubstate      = {published},
  tppubtype     = {article},
  internal-note = {NOTE(review): the article entry type requires a journal field, which is missing here. The publisher and address fields suggest this may be a proceedings paper -- confirm the venue and, if so, switch to inproceedings with a booktitle.}
}

Biomedical entity extraction from unstructured web documents is an important task that needs to be performed in order to discover knowledge in the veterinary medicine domain. In general, this task can be approached by applying domain-specific ontologies, but a review of the literature shows that there is no universal dictionary, or ontology for this domain. To address this issue, we manually construct an ontology for extracting entities such as: animal disease names, viruses and serotypes. We then use an automated ontology expansion approach to extract semantic relationships between concepts. Such relationships include asserted synonymy, hyponymy and causality. Specifically, these relationships are extracted by using a set of syntactic patterns and part-of-speech tagging. The resulting ontology contains richer semantics compared to the manually-constructed ontology. We compare our approach for extracting synonyms, hyponyms and other disease related concepts, with an approach where the ontology is expanded using GoogleSets, on the veterinary medicine entity extraction task. Experimental results show that our semantic relationship extraction approach produces a significant increase in precision and recall as compared to the GoogleSets approach.

@inproceedings{Volkova_2010_ISI,
  author    = {Svitlana Volkova and William Hsu},
  title     = {Computational Knowledge and Information Management in Veterinary Epidemiology},
  year      = {2010},
  date      = {2010-01-01},
  booktitle = {IEEE International Conference on Intelligence and Security Informatics, ISI 2010, Vancouver, BC, Canada, May 23--26, 2010, Proceedings},
  pages     = {120--125},
  publisher = {IEEE},
  abstract  = {Predictive epidemiology is focused on spatiotemporal modeling of the spread of infectious diseases, with the goal of suggesting optimal mitigation strategies to control the impact of such diseases on the society and environment. Due to security reasons, the input data for epidemiological models is usually restricted in use. Such restrictions lead to information incompleteness that, in turn, drops the prediction accuracy. Moreover, a plethora of animal disease-related information is available online and it can be used as an input for the predictive models. In this paper we present an animal disease-related event recognition and classification approach in the domain of predictive epidemiology. We first extract domain-specific and domain-independent entities including disease names, dates, species with corresponding numbers and geo-locations. We then generate event tuples using extracted event attributes and classify them into three categories: susceptible, infected or recovered. Finally, we demonstrate how to use the results of our event recognition and classification approach as an input to a spatio-temporal model for epidemic spread prediction.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {inproceedings}
}

Predictive epidemiology is focused on spatiotemporal modeling of the spread of infectious diseases, with the goal of suggesting optimal mitigation strategies to control the impact of such diseases on the society and environment. Due to security reasons, the input data for epidemiological models is usually restricted in use. Such restrictions lead to information incompleteness that, in turn, drops the prediction accuracy. Moreover, a plethora of animal disease-related information is available online and it can be used as an input for the predictive models. In this paper we present an animal disease-related event recognition and classification approach in the domain of predictive epidemiology. We first extract domain-specific and domain-independent entities including disease names, dates, species with corresponding numbers and geo-locations. We then generate event tuples using extracted event attributes and classify them into three categories: susceptible, infected or recovered. Finally, we demonstrate how to use the results of our event recognition and classification approach as an input to a spatio-temporal model for epidemic spread prediction.

@inproceedings{Volkova_2010_WWW,
  author    = {Svitlana Volkova and Doina Caragea and William Hsu and Swathi Bujuru},
  title     = {Animal Disease Event Recognition and Classification},
  year      = {2010},
  date      = {2010-01-01},
  booktitle = {First International Workshop on Web Science and Information Exchange in the Medical Web, MedEx 2010, Raleigh, NC, USA, April 26, 2010},
  pages     = {51--61},
  abstract  = {Monitoring epidemic crises, caused by rapid spread of infectious animal diseases, can be facilitated by the plethora of information about disease-related events that is available online. Therefore, the ability to use this information to perform domain-specific entity recognition and event-related sentence classification, which in turn can support time and space visualization of automatically extracted events, is highly desirable. Towards this goal, we present a rule-based approach to the problem of extracting animal disease-related events from web documents. Our approach relies on the recognition of structured entity tuples, consisting of attributes, which describe events related to animal diseases. The event attributes that we consider include animal diseases, dates, species and geo-referenced locations. We perform disease names and species recognition using an automatically-constructed ontology, dates are extracted using regular expressions, while locations are extracted using a conditional random fields tool. The extracted events are further classified as confirmed or suspected based on semantic features, obtained from, e.g., GoogleSets and WordNet. Our preliminary results demonstrate the feasibility of the proposed approach.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {inproceedings}
}

Monitoring epidemic crises, caused by rapid spread of infectious animal diseases, can be facilitated by the plethora of information about disease-related events that is available online. Therefore, the ability to use this information to perform domain-specific entity recognition and event-related sentence classification, which in turn can support time and space visualization of automatically extracted events, is highly desirable. Towards this goal, we present a rule-based approach to the problem of extracting animal disease-related events from web documents. Our approach relies on the recognition of structured entity tuples, consisting of attributes, which describe events related to animal diseases. The event attributes that we consider include animal diseases, dates, species and geo-referenced locations. We perform disease names and species recognition using an automatically-constructed ontology, dates are extracted using regular expressions, while locations are extracted using a conditional random fields tool. The extracted events are further classified as confirmed or suspected based on semantic features, obtained from, e.g., GoogleSets and WordNet. Our preliminary results demonstrate the feasibility of the proposed approach.

We introduce Zipporah, a fast and scalable data cleaning system. We propose a
novel type of bag-of-words translation feature, and train logistic regression
models to classify good data and synthetic noisy data in the proposed feature
space. The trained model is used to score parallel sentences in the data pool
for selection. As shown in experiments, Zipporah selects a high-quality
parallel corpus from a large, mixed quality data pool. In particular, for one
noisy dataset, Zipporah achieves a 2.1 BLEU score improvement while using only
1/5 of the data, compared with using the entire corpus.

We explore six challenges for neural machine translation: domain mismatch,
amount of training data, rare words, long sentences, word alignment, and beam
search. We show both deficiencies and improvements over the quality of
phrase-based statistical machine translation.

We present a feature-rich knowledge tracing method that captures a student's
acquisition and retention of knowledge during a foreign language phrase
learning task. We model the student's behavior as making predictions under a
log-linear model, and adopt a neural gating mechanism to model how the student
updates their log-linear parameters in response to feedback. The gating
mechanism allows the model to learn complex patterns of retention and
acquisition for each feature, while the log-linear parameterization results in
an interpretable knowledge state. We collect human data and evaluate several
versions of the model.

@article{Dredze:2017fv,
  author    = {Mark Dredze and Zachary Wood-Doughty and Sandra Crouse Quinn and David A Broniatowski},
  title     = {Vaccine opponents' use of {Twitter} during the 2016 {US} presidential election: Implications for practice and policy},
  year      = {2017},
  date      = {2017-07-01},
  journal   = {Vaccine},
  abstract  = {The recent inauguration of President Trump carries with it many public health policy implications. During the election, President Trump, like all political candidates, made policy commitments to various interest groups including vaccine skeptics. These groups celebrated the announcement that Robert Kennedy Jr., a noted proponent of a causal link between vaccines and autism, may chair a commission on vaccines. Furthermore, during the GOP primaries, Mr. Trump endorsed messages associated with vaccine refusal on Twitter, and met with prominent vaccine refusal advocates including Andrew Wakefield, who published the retracted and discredited 1998 Lancet article claiming to link autism to MMR vaccination. In this paper, we show that the new administration has mobilized vaccine refusal advocates, potentially enabling them to influence the national agenda in a manner that could lead to changes in existing vaccination policy.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {article}
}

The recent inauguration of President Trump carries with it many public health policy implications. During the election, President Trump, like all political candidates, made policy commitments to various interest groups including vaccine skeptics. These groups celebrated the announcement that Robert Kennedy Jr., a noted proponent of a causal link between vaccines and autism, may chair a commission on vaccines. Furthermore, during the GOP primaries, Mr. Trump endorsed messages associated with vaccine refusal on Twitter, and met with prominent vaccine refusal advocates including Andrew Wakefield, who published the retracted and discredited 1998 Lancet article claiming to link autism to MMR vaccination. In this paper, we show that the new administration has mobilized vaccine refusal advocates, potentially enabling them to influence the national agenda in a manner that could lead to changes in existing vaccination policy.

We propose a new dependency parsing scheme which jointly parses a sentence and
repairs grammatical errors by extending the non-directional transition-based
formalism of Goldberg and Elhadad (2010) with three additional actions:
SUBSTITUTE, DELETE, INSERT. Because these actions may cause an infinite loop in
derivation, we also introduce simple constraints that ensure the parser
termination. We evaluate our model with respect to dependency accuracy and
grammaticality improvements for ungrammatical sentences, demonstrating the
robustness and applicability of our scheme.

Proceedings of the 15th Conference of the European Chapter of the Association for Computational Linguistics: Volume 2, Short Papers, pp. 229–234, Association for Computational Linguistics, Valencia, Spain, 2017.

Many domain adaptation approaches rely on learning cross domain shared representations to transfer the knowledge learned in one domain to other domains. Traditional domain adaptation only considers adapting for one task. In this paper, we explore multi-task representation learning under the domain adaptation scenario. We propose a neural network framework that supports domain adaptation for multiple tasks simultaneously, and learns shared representations that better generalize for domain adaptation. We apply the proposed framework to domain adaptation for sequence tagging problems considering two tasks: Chinese word segmentation and named entity recognition. Experiments show that multi-task domain adaptation works better than disjoint domain adaptation for each task, and achieves the state-of-the-art results for both tasks in the social media domain.

@inproceedings{Wood-Doughty:2017lr,
  author    = {Zach Wood-Doughty and Michael Smith and David Broniatowski and Mark Dredze},
  title     = {How Does {Twitter} User Behavior Vary Across Demographic Groups?},
  year      = {2017},
  date      = {2017-01-01},
  booktitle = {ACL Workshop on Natural Language Processing and Computational Social Science},
  abstract  = {Demographically-tagged social media messages are a common source of data for computational social science. While these messages can indicate differences in beliefs and behaviors between demographic groups, we do not have a clear understanding of how different demographic groups use platforms such as Twitter. This paper presents a preliminary analysis of how groups' differing behaviors may confound analyses of the groups themselves. We analyzed one million Twitter users by first inferring demographic attributes, and then measuring several indicators of Twitter behavior. We find differences in these indicators across demographic groups, suggesting that there may be underlying differences in how different demographic groups use Twitter.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {inproceedings}
}

Demographically-tagged social media messages are a common source of data for computational social science. While these messages can indicate differences in beliefs and behaviors between demographic groups, we do not have a clear understanding of how different demographic groups use platforms such as Twitter. This paper presents a preliminary analysis of how groups' differing behaviors may confound analyses of the groups themselves. We analyzed one million Twitter users by first inferring demographic attributes, and then measuring several indicators of Twitter behavior. We find differences in these indicators across demographic groups, suggesting that there may be underlying differences in how different demographic groups use Twitter.

@article{Allem:2017qd,
  author    = {Jon-Patrick Allem and Eric C Leas and Theodore L Caputi and Mark Dredze and Benjamin M Althouse and Seth M Noar and John W Ayers},
  title     = {The {Charlie Sheen} Effect on Rapid In-home Human Immunodeficiency Virus Test Sales},
  year      = {2017},
  date      = {2017-01-01},
  journal   = {Prevention Science},
  abstract  = {One in eight of the 1.2 million Americans living with human immunodeficiency virus (HIV) are unaware of their positive status, and untested individuals are responsible for most new infections. As a result, testing is the most cost-effective HIV prevention strategy and must be accelerated when opportunities are presented. Web searches for HIV spiked around actor Charlie Sheen's HIV-positive disclosure. However, it is unknown whether Sheen's disclosure impacted offline behaviors like HIV testing. The goal of this study was to determine if Sheen's HIV disclosure was a record-setting HIV prevention event and determine if Web searches presage increases in testing allowing for rapid detection and reaction in the future. Sales of OraQuick rapid in-home HIV test kits in the USA were monitored weekly from April 12, 2014, to April 16, 2016, alongside Web searches including the terms ``test,'' ``tests,'' or ``testing'' and ``HIV'' as accessed from Google Trends. Changes in OraQuick sales around Sheen's disclosure and prediction models using Web searches were assessed. OraQuick sales rose 95\% (95\% CI, 75--117; p < 0.001) of the week of Sheen's disclosure and remained elevated for 4 more weeks (p < 0.05). In total, there were 8225 more sales than expected around Sheen's disclosure, surpassing World AIDS Day by a factor of about 7. Moreover, Web searches mirrored OraQuick sales trends (r = 0.79), demonstrating their ability to presage increases in testing. The ``Charlie Sheen effect'' represents an important opportunity for a public health response, and in the future, Web searches can be used to detect and act on more opportunities to foster prevention behaviors.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {article}
}

One in eight of the 1.2 million Americans living with human immunodeficiency virus (HIV) are unaware of their positive status, and untested individuals are responsible for most new infections. As a result, testing is the most cost-effective HIV prevention strategy and must be accelerated when opportunities are presented. Web searches for HIV spiked around actor Charlie Sheen's HIV-positive disclosure. However, it is unknown whether Sheen's disclosure impacted offline behaviors like HIV testing. The goal of this study was to determine if Sheen's HIV disclosure was a record-setting HIV prevention event and determine if Web searches presage increases in testing allowing for rapid detection and reaction in the future. Sales of OraQuick rapid in-home HIV test kits in the USA were monitored weekly from April 12, 2014, to April 16, 2016, alongside Web searches including the terms ``test,'' ``tests,'' or ``testing'' and ``HIV'' as accessed from Google Trends. Changes in OraQuick sales around Sheen's disclosure and prediction models using Web searches were assessed. OraQuick sales rose 95% (95% CI, 75--117; p < 0.001) of the week of Sheen's disclosure and remained elevated for 4 more weeks (p < 0.05). In total, there were 8225 more sales than expected around Sheen's disclosure, surpassing World AIDS Day by a factor of about 7. Moreover, Web searches mirrored OraQuick sales trends (r = 0.79), demonstrating their ability to presage increases in testing. The ``Charlie Sheen effect'' represents an important opportunity for a public health response, and in the future, Web searches can be used to detect and act on more opportunities to foster prevention behaviors.

@inproceedings{Gao:2017wo,
  author    = {Ning Gao and Douglas Oard and Mark Dredze},
  title     = {Support for Interactive Identification of Mentioned Entities in Conversational Speech},
  year      = {2017},
  date      = {2017-01-01},
  booktitle = {International Conference on Research and Development in Information Retrieval (SIGIR)},
  note      = {Short paper},
  abstract  = {Searching conversational speech poses several new challenges, among which is how the searcher will make sense of what they find. This paper describes our initial experiments with a freely available collection of Enron telephone conversations. Our goal is to help the user make sense of search results by finding information about mentioned people, places and organizations. Because automated entity recognition is not yet sufficiently accurate on conversational telephone speech, we ask the user to transcribe just the name, and to indicate where in the recording it was heard. We then seek to link that mention to other mentions of the same entity in a variety of sources (in our experiments, in email and in Wikipedia). We cast this as an entity linking problem, and achieve promising results by utilizing social network features to help compensate for the limited accuracy of automatic transcription for this challenging content.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {inproceedings}
}

Searching conversational speech poses several new challenges, among which is how the searcher will make sense of what they find. This paper describes our initial experiments with a freely available collection of Enron telephone conversations. Our goal is to help the user make sense of search results by finding information about mentioned people, places and organizations. Because automated entity recognition is not yet sufficiently accurate on conversational telephone speech, we ask the user to transcribe just the name, and to indicate where in the recording it was heard. We then seek to link that mention to other mentions of the same entity in a variety of sources (in our experiments, in email and in Wikipedia). We cast this as an entity linking problem, and achieve promising results by utilizing social network features to help compensate for the limited accuracy of automatic transcription for this challenging content.

Existing Knowledge Base Population methods extract relations from a closed relational schema with limited coverage, leading to sparse KBs. We propose Pocket Knowledge Base Population (PKBP), the task of dynamically constructing a KB of entities related to a query and finding the best characterization of relationships between entities. We describe novel Open Information Extraction methods which leverage the PKB to find informative trigger words. We evaluate using existing KBP shared-task data as well as new annotations collected for this work. Our methods produce high quality KBs from just text with many more entities and relationships than existing KBP systems.

@article{Gao:2017pb,
  author    = {Ning Gao and Mark Dredze and Douglas Oard},
  title     = {Person Entity Linking in Email with {NIL} Detection},
  year      = {2017},
  date      = {2017-01-01},
  journal   = {Journal of the Association for Information Science and Technology},
  abstract  = {For each specific mention of an entity found in a text, the goal of entity linking is to determine whether the referenced entity is present in an existing knowledge base, and if so to determine which KB entity is the correct referent. Entity linking has been well explored for dissemination-oriented sources such as news stories, blogs, and microblog posts, but the limited work to date on ``conversational'' sources such as email or text chat has not yet attempted to determine when the referent entity is not in the knowledge base (a task known as ``NIL detection''). This article presents a supervised machine learning system for linking named mentions of people in email messages to a collection-specific knowledge base, and that is also capable of NIL detection. This system learns from manually annotated training examples to leverage a rich set of features. The entity linking accuracy for entities present in the knowledge base is substantially and significantly better than the best previously reported results on the Enron email collection, comparable accuracy is reported for the challenging NIL detection task, and these results are for the first time replicated on a second email collection from a different source with comparable results.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {article}
}

For each specific mention of an entity found in a text, the goal of entity linking is to determine whether the referenced entity is present in an existing knowledge base, and if so to determine which KB entity is the correct referent. Entity linking has been well explored for dissemination-oriented sources such as news stories, blogs, and microblog posts, but the limited work to date on ``conversational'' sources such as email or text chat has not yet attempted to determine when the referent entity is not in the knowledge base (a task known as ``NIL detection''). This article presents a supervised machine learning system for linking named mentions of people in email messages to a collection-specific knowledge base, and that is also capable of NIL detection. This system learns from manually annotated training examples to leverage a rich set of features. The entity linking accuracy for entities present in the knowledge base is substantially and significantly better than the best previously reported results on the Enron email collection, comparable accuracy is reported for the challenging NIL detection task, and these results are for the first time replicated on a second email collection from a different source with comparable results.

@unpublished{1702.05793,
  author     = {Ann Irvine and Mark Dredze},
  title      = {{Harmonic Grammar}, {Optimality Theory}, and Syntax Learnability: An Empirical Exploration of {Czech} Word Order},
  year       = {2017},
  date       = {2017-01-01},
  eprint     = {1702.05793},
  eprinttype = {arXiv},
  note       = {Preprint},
  abstract   = {This work presents a systematic theoretical and empirical comparison of the major algorithms that have been proposed for learning Harmonic and Optimality Theory grammars (HG and OT, respectively). By comparing learning algorithms, we are also able to compare the closely related OT and HG frameworks themselves. Experimental results show that the additional expressivity of the HG framework over OT affords performance gains in the task of predicting the surface word order of Czech sentences. We compare the perceptron with the classic Gradual Learning Algorithm (GLA), which learns OT grammars, as well as the popular Maximum Entropy model. In addition to showing that the perceptron is theoretically appealing, our work shows that the performance of the HG model it learns approaches that of the upper bound in prediction accuracy on a held out test set and that it is capable of accurately modeling observed variation.},
  keywords   = {},
  pubstate   = {published},
  tppubtype  = {unpublished}
}

This work presents a systematic theoretical and empirical comparison of the major algorithms that have been proposed for learning Harmonic and Optimality Theory grammars (HG and OT, respectively). By comparing learning algorithms, we are also able to compare the closely related OT and HG frameworks themselves. Experimental results show that the additional expressivity of the HG framework over OT affords performance gains in the task of predicting the surface word order of Czech sentences. We compare the perceptron with the classic Gradual Learning Algorithm (GLA), which learns OT grammars, as well as the popular Maximum Entropy model. In addition to showing that the perceptron is theoretically appealing, our work shows that the performance of the HG model it learns approaches that of the upper bound in prediction accuracy on a held out test set and that it is capable of accurately modeling observed variation.

@inproceedings{Benton:2017lq,
  author    = {Adrian Benton and Glen Coppersmith and Mark Dredze},
  title     = {Ethical Research Protocols for Social Media Health Research},
  booktitle = {EACL Workshop on Ethics in Natural Language Processing},
  year      = {2017},
  date      = {2017-01-01},
  abstract  = {Social media have transformed data driven research in political science, the social sciences, health, and medicine. Since health research often touches on sensitive topics that relate to ethics of treatment and patient privacy, similar ethical considerations should be acknowledged when using social media data in health research. While much has been said regarding the ethical considerations of social media research, health research leads to an additional set of concerns. We provide practical suggestions in the form of guidelines for researchers working with social media data in health research. These guidelines can inform an IRB proposal for researchers new to social media health research.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {inproceedings}
}

Social media have transformed data driven research in political science, the social sciences, health, and medicine. Since health research often touches on sensitive topics that relate to ethics of treatment and patient privacy, similar ethical considerations should be acknowledged when using social media data in health research. While much has been said regarding the ethical considerations of social media research, health research leads to an additional set of concerns. We provide practical suggestions in the form of guidelines for researchers working with social media data in health research. These guidelines can inform an IRB proposal for researchers new to social media health research.

@article{Ayers:2017qf,
  title     = {Why Do People Use Electronic Nicotine Delivery Systems (Electronic Cigarettes)? {A} Content Analysis of {Twitter}, 2012--2015},
  author    = {Ayers, John and Leas, Eric C. and Allem, Jon-Patrick and Benton, Adrian and Dredze, Mark and Althouse, Benjamin M. and Cruz, Tess B. and Unger, Jennifer B.},
  year      = {2017},
  date      = {2017-01-01},
  journal   = {PLoS One},
  abstract  = {The reasons for using electronic nicotine delivery systems (ENDS) are poorly understood and are primarily documented by expensive cross-sectional surveys that use preconceived close-ended response options rather than allowing respondents to use their own words. We passively identify the reasons for using ENDS longitudinally from a content analysis of public postings on Twitter. All English language public tweets including several ENDS terms (e.g., ``e-cigarette'' or ``vape'') were captured from the Twitter data stream during 2012 and 2015. After excluding spam, advertisements, and retweets, posts indicating a rationale for vaping were retained. The specific reasons for vaping were then inferred based on a supervised content analysis using annotators from Amazon's Mechanical Turk. During 2012 quitting combustibles was the most cited reason for using ENDS with 43\% (95\%CI 39--48) of all reason-related tweets citing quitting combustibles, e.g., ``I couldn't quit till I tried ecigs,'' eclipsing the second most cited reason by more than double. Other frequently cited reasons in 2012 included ENDS's social image (21\%; 95\%CI 18--25), use indoors (14\%; 95\%CI 11--17), flavors (14\%; 95\%CI 11--17), safety relative to combustibles (9\%; 95\%CI 7--11), cost (3\%; 95\%CI 2--5) and favorable odor (2\%; 95\%CI 1--3). By 2015 the reasons for using ENDS cited on Twitter had shifted. Both quitting combustibles and use indoors significantly declined in mentions to 29\% (95\%CI 24--33) and 12\% (95\%CI 9--16), respectively. At the same time, social image increased to 37\% (95\%CI 32--43) and lack of odor increased to 5\% (95\%CI 2--5), the former leading all cited reasons in 2015. Our data suggest the reasons people vape are shifting away from cessation and toward social image. The data also show how the ENDS market is responsive to a changing policy landscape. For instance, smoking indoors was less frequently cited in 2015 as indoor smoking restrictions became more common. Because the data and analytic approach are scalable, adoption of our strategies in the field can inform follow-up survey-based surveillance (so the right questions are asked), interventions, and policies for ENDS.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {article}
}

The reasons for using electronic nicotine delivery systems (ENDS) are poorly understood and are primarily documented by expensive cross-sectional surveys that use preconceived close-ended response options rather than allowing respondents to use their own words. We passively identify the reasons for using ENDS longitudinally from a content analysis of public postings on Twitter. All English language public tweets including several ENDS terms (e.g., ``e-cigarette'' or ``vape'') were captured from the Twitter data stream during 2012 and 2015. After excluding spam, advertisements, and retweets, posts indicating a rationale for vaping were retained. The specific reasons for vaping were then inferred based on a supervised content analysis using annotators from Amazon's Mechanical Turk. During 2012 quitting combustibles was the most cited reason for using ENDS with 43% (95%CI 39--48) of all reason-related tweets citing quitting combustibles, e.g., ``I couldn't quit till I tried ecigs,'' eclipsing the second most cited reason by more than double. Other frequently cited reasons in 2012 included ENDS's social image (21%; 95%CI 18--25), use indoors (14%; 95%CI 11--17), flavors (14%; 95%CI 11--17), safety relative to combustibles (9%; 95%CI 7--11), cost (3%; 95%CI 2--5) and favorable odor (2%; 95%CI 1--3). By 2015 the reasons for using ENDS cited on Twitter had shifted. Both quitting combustibles and use indoors significantly declined in mentions to 29% (95%CI 24--33) and 12% (95%CI 9--16), respectively. At the same time, social image increased to 37% (95%CI 32--43) and lack of odor increased to 5% (95%CI 2--5), the former leading all cited reasons in 2015. Our data suggest the reasons people vape are shifting away from cessation and toward social image. The data also show how the ENDS market is responsive to a changing policy landscape. For instance, smoking indoors was less frequently cited in 2015 as indoor smoking restrictions became more common. Because the data and analytic approach are scalable, adoption of our strategies in the field can inform follow-up survey-based surveillance (so the right questions are asked), interventions, and policies for ENDS.

@article{Nastasi:2017qq,
  title     = {Breast Cancer Screening and Social Media: A Content Analysis of Evidence Use and Guideline Opinions on {Twitter}},
  author    = {Nastasi, Anthony and Bryant, Tyler and Canner, Joseph K. and Dredze, Mark and Camp, Melissa S. and Nagarajan, Neeraja},
  year      = {2017},
  date      = {2017-01-01},
  journal   = {Journal of Cancer Education},
  abstract  = {There is ongoing debate regarding the best mammography screening practices. Twitter has become a powerful tool for disseminating medical news and fostering healthcare conversations; however, little work has been done examining these conversations in the context of how users are sharing evidence and discussing current guidelines for breast cancer screening. To characterize the Twitter conversation on mammography and assess the quality of evidence used as well as opinions regarding current screening guidelines, individual tweets using mammography-related hashtags were prospectively pulled from Twitter from 5 November 2015 to 11 December 2015. Content analysis was performed on the tweets by abstracting data related to user demographics, content, evidence use, and guideline opinions. Standard descriptive statistics were used to summarize the results. Comparisons were made by demographics, tweet type (testable claim, advice, personal experience, etc.), and user type (non-healthcare, physician, cancer specialist, etc.). The primary outcomes were how users are tweeting about breast cancer screening, the quality of evidence they are using, and their opinions regarding guidelines. The most frequent user type of the 1345 tweets was ``non-healthcare'' with 323 tweets (32.5\%). Physicians had 1.87 times higher odds (95\% CI, 0.69--5.07) of providing explicit support with a reference and 11.70 times higher odds (95\% CI, 3.41--40.13) of posting a tweet likely to be supported by the scientific community compared to non-healthcare users. Only 2.9\% of guideline tweets approved of the guidelines while 14.6\% claimed to be confused by them. Non-healthcare users comprise a significant proportion of participants in mammography conversations, with tweets often containing claims that are false, not explicitly backed by scientific evidence, and in favor of alternative ``natural'' breast cancer prevention and treatment. Furthermore, users appear to have low approval and confusion regarding screening guidelines. These findings suggest that more efforts are needed to educate and disseminate accurate information to the general public regarding breast cancer prevention modalities, emphasizing the safety of mammography and the harms of replacing conventional prevention and treatment modalities with unsubstantiated alternatives.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {article}
}

There is ongoing debate regarding the best mammography screening practices. Twitter has become a powerful tool for disseminating medical news and fostering healthcare conversations; however, little work has been done examining these conversations in the context of how users are sharing evidence and discussing current guidelines for breast cancer screening. To characterize the Twitter conversation on mammography and assess the quality of evidence used as well as opinions regarding current screening guidelines, individual tweets using mammography-related hashtags were prospectively pulled from Twitter from 5 November 2015 to 11 December 2015. Content analysis was performed on the tweets by abstracting data related to user demographics, content, evidence use, and guideline opinions. Standard descriptive statistics were used to summarize the results. Comparisons were made by demographics, tweet type (testable claim, advice, personal experience, etc.), and user type (non-healthcare, physician, cancer specialist, etc.). The primary outcomes were how users are tweeting about breast cancer screening, the quality of evidence they are using, and their opinions regarding guidelines. The most frequent user type of the 1345 tweets was ``non-healthcare'' with 323 tweets (32.5%). Physicians had 1.87 times higher odds (95% CI, 0.69--5.07) of providing explicit support with a reference and 11.70 times higher odds (95% CI, 3.41--40.13) of posting a tweet likely to be supported by the scientific community compared to non-healthcare users. Only 2.9% of guideline tweets approved of the guidelines while 14.6% claimed to be confused by them. Non-healthcare users comprise a significant proportion of participants in mammography conversations, with tweets often containing claims that are false, not explicitly backed by scientific evidence, and in favor of alternative ``natural'' breast cancer prevention and treatment. 
Furthermore, users appear to have low approval and confusion regarding screening guidelines. These findings suggest that more efforts are needed to educate and disseminate accurate information to the general public regarding breast cancer prevention modalities, emphasizing the safety of mammography and the harms of replacing conventional prevention and treatment modalities with unsubstantiated alternatives.

@inproceedings{Huang:2017ygb,
  title     = {Examining Patterns of Influenza Vaccination in Social Media},
  author    = {Huang, Xiaolei and Smith, Michael C. and Paul, Michael and Ryzhkov, Dmytro and Quinn, Sandra and Broniatowski, David and Dredze, Mark},
  year      = {2017},
  date      = {2017-01-01},
  booktitle = {{AAAI} Joint Workshop on Health Intelligence ({W3PHIAI})},
  abstract  = {Traditional data on influenza vaccination has several limitations: high cost, limited coverage of underrepresented groups, and low sensitivity to emerging public health issues. Social media, such as Twitter, provide an alternative way to understand a population's vaccination-related opinions and behaviors. In this study, we build and employ several natural language classifiers to examine and analyze behavioral patterns regarding influenza vaccination in Twitter across three dimensions: temporality (by week and month), geography (by US region), and demography (by gender). Our best results are highly correlated with official government data, with a correlation over 0.90, providing validation of our approach. We then suggest a number of directions for future work.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {inproceedings}
}

Traditional data on influenza vaccination has several limitations: high cost, limited coverage of underrepresented groups, and low sensitivity to emerging public health issues. Social media, such as Twitter, provide an alternative way to understand a population's vaccination-related opinions and behaviors. In this study, we build and employ several natural language classifiers to examine and analyze behavioral patterns regarding influenza vaccination in Twitter across three dimensions: temporality (by week and month), geography (by US region), and demography (by gender). Our best results are highly correlated with official government data, with a correlation over 0.90, providing validation of our approach. We then suggest a number of directions for future work.

@inproceedings{Nagarajan:2017kx,
  title     = {The Utility of {Twitter} in Generating High-Quality Conversations about Surgical Care},
  author    = {Nagarajan, Neeraja and Alshaikh, Husain and Nastasi, Anthony and Smart, Blair and Berger, Zackary and Schneider, Eric B. and Dredze, Mark and Canner, Joseph K. and Ahuja, Nita},
  year      = {2017},
  date      = {2017-01-01},
  booktitle = {Academic Surgical Congress},
  abstract  = {Introduction: There is growing interest among various stakeholders in using social media sites to discuss healthcare issues. However, little is known about how social media sites are used to discuss surgical care. There is also a lack of understanding of the types of content generated and the quality of the information shared in social media platforms about surgical care issues. We therefore sought to identify and summarize conversations on surgical care in Twitter, a popular microblogging website. Methods: A comprehensive list of surgery-related hashtags was used to pull individual tweets from 3/27-4/27/2015. Four independent reviewers blindly analyzed 25 tweets to develop themes for extraction from a larger sample. The themes were broadly divided further to obtain data at the levels of the user, the tweet, the content of the tweet and personal information shared (Figure I). Standard descriptive statistical analysis and simple logistic regression analysis was used. Results: In total, 17,783 tweets were pulled and 1000 from 615 unique users were randomly selected for analysis. Most users were from North America (62.4\%) and non-healthcare related individuals (31.8\%). Healthcare organizations generated 12.4\%, and surgeons 9.5\%, of tweets. Overall, 67.4\% were original tweets and 79.0\% contained a hyperlink (11\% to healthcare and 8.7\% to peer-reviewed sources). The common areas of surgery discussed were global surgery/health systems (18.4\%), followed by general surgery (15.6\%). Among personal tweets (n=236), 31.1\% concerned surgery on family/friends and 24.4\% on the user; 61.1\% discussed procedures already performed and 58.0\% used positive language about their personal experience with surgical care. Surgical news/opinion was present in 45\% of tweets and 13.7\% contained evidence-based information. Non-healthcare professionals were 53.5\% (95\% CI: 3.8\%-77.5\% [...]},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {inproceedings}
}

Introduction: There is growing interest among various stakeholders in using social media sites to discuss healthcare issues. However, little is known about how social media sites are used to discuss surgical care. There is also a lack of understanding of the types of content generated and the quality of the information shared in social media platforms about surgical care issues. We therefore sought to identify and summarize conversations on surgical care in Twitter, a popular microblogging website. Methods: A comprehensive list of surgery-related hashtags was used to pull individual tweets from 3/27-4/27/2015. Four independent reviewers blindly analyzed 25 tweets to develop themes for extraction from a larger sample. The themes were broadly divided further to obtain data at the levels of the user, the tweet, the content of the tweet and personal information shared (Figure I). Standard descriptive statistical analysis and simple logistic regression analysis was used. Results: In total, 17,783 tweets were pulled and 1000 from 615 unique users were randomly selected for analysis. Most users were from North America (62.4%) and non-healthcare related individuals (31.8%). Healthcare organizations generated 12.4%, and surgeons 9.5%, of tweets. Overall, 67.4% were original tweets and 79.0% contained a hyperlink (11% to healthcare and 8.7% to peer-reviewed sources). The common areas of surgery discussed were global surgery/health systems (18.4%), followed by general surgery (15.6%). Among personal tweets (n=236), 31.1% concerned surgery on family/friends and 24.4% on the user; 61.1% discussed procedures already performed and 58.0% used positive language about their personal experience with surgical care. Surgical news/opinion was present in 45% of tweets and 13.7% contained evidence-based information. Non-healthcare professionals were 53.5% (95% CI: 3.8%-77.5% [...]

@article{NOAR2017,
  title     = {Can a selfie promote public engagement with skin cancer?},
  author    = {Noar, Seth M. and Leas, Eric and Althouse, Benjamin M. and Dredze, Mark and Kelley, Dannielle and Ayers, John W.},
  doi       = {10.1016/j.ypmed.2017.10.038},
  issn      = {0091-7435},
  year      = {2017},
  date      = {2017-01-01},
  journal   = {Preventive Medicine},
  abstract  = {Social media may provide new opportunities to promote skin cancer prevention, but research to understand this potential is needed. In April of 2015, Kentucky native Tawny Willoughby (TW) shared a graphic skin cancer selfie on Facebook that subsequently went viral. We examined the volume of comments and shares of her original Facebook post; news volume of skin cancer from Google News; and search volume for skin cancer Google queries. We compared these latter metrics after TW's announcement against expected volumes based on forecasts of historical trends. TW's skin cancer selfie went viral on May 11, 2015 after the social media post had been shared approximately 50,000 times. All search queries for skin cancer increased 162\% (95\% CI 102 to 320) and 155\% (95\% CI 107 to 353) on May 13th and 14th, when news about TW's skin cancer selfie was at its peak, and remained higher through May 17th. Google searches about skin cancer prevention and tanning were also significantly higher than expected volumes. In practical terms, searches reached near-record levels - i.e., May 13th, 14th and 15th were respectively the 6th, 8th, and 40th most searched days for skin cancer since January 1, 2004 when Google began tracking searches. We conclude that an ordinary person's social media post caught the public's imagination and led to significant increases in public engagement with skin cancer prevention. Digital surveillance methods can rapidly detect these events in near real time, allowing public health practitioners to engage and potentially elevate positive effects.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {article}
}

Social media may provide new opportunities to promote skin cancer prevention, but research to understand this potential is needed. In April of 2015, Kentucky native Tawny Willoughby (TW) shared a graphic skin cancer selfie on Facebook that subsequently went viral. We examined the volume of comments and shares of her original Facebook post; news volume of skin cancer from Google News; and search volume for skin cancer Google queries. We compared these latter metrics after TW's announcement against expected volumes based on forecasts of historical trends. TW's skin cancer selfie went viral on May 11, 2015 after the social media post had been shared approximately 50,000 times. All search queries for skin cancer increased 162% (95% CI 102 to 320) and 155% (95% CI 107 to 353) on May 13th and 14th, when news about TW's skin cancer selfie was at its peak, and remained higher through May 17th. Google searches about skin cancer prevention and tanning were also significantly higher than expected volumes. In practical terms, searches reached near-record levels - i.e., May 13th, 14th and 15th were respectively the 6th, 8th, and 40th most searched days for skin cancer since January 1, 2004 when Google began tracking searches. We conclude that an ordinary person's social media post caught the public's imagination and led to significant increases in public engagement with skin cancer prevention. Digital surveillance methods can rapidly detect these events in near real time, allowing public health practitioners to engage and potentially elevate positive effects.

Upcoming Seminars

Abstract: Discourse relations such as ‘contrast’, ‘cause’ or ‘evidence’ are often postulated to explain how humans understand the function of one sentence in relation to another. Some relations are signaled rather directly using words such [...]

Abstract: Speech applications, such as text-to-speech (TTS) or automatic speech recognition (ASR), must not only know how to read ordinary words, but must also know how to read numbers, abbreviations, measure expressions, times, dates, and [...]