Publications

arXiv 2025

Safety Pretraining: Toward the Next Generation of Safe AI

Pratyush Maini*, Sachin Goyal*, Dylan Sam*, Alex Robey, Yash Savani, Yiding Jiang, Andy Zou, Matt Fredrikson, Zachary C. Lipton, J. Zico Kolter

A framework that builds safety into models during pretraining, via safety-focused data curation, rather than relying solely on post-hoc alignment.

ACL 2024

Rephrasing the Web: A Recipe for Compute and Data-Efficient Language Modeling

Pratyush Maini, Skyler Seto, He Bai, David Grangier, Yizhe Zhang, Navdeep Jaitly

Paraphrasing web data into Q/A pairs significantly improves language model training efficiency.

CVPR 2024 Best Paper (DPFM Workshop)

Scaling Laws for Data Filtering — Data Curation cannot be Compute Agnostic

Sachin Goyal, Pratyush Maini, Zachary C. Lipton, Aditi Raghunathan, J. Zico Kolter

Training compute-optimal models requires data filtering that scales with available compute.

NeurIPS 2024 Oral (Private-NLP Workshop)

LLM Dataset Inference: Did you train on my dataset?

Pratyush Maini*, Hengrui Jia*, Nicolas Papernot, Adam Dziedzic

Black-box detection of whether a dataset was used to train an LLM.

COLM 2024 Oral (Set-LLM Workshop)

TOFU: A Task of Fictitious Unlearning for LLMs

Pratyush Maini*, Zhili Feng*, Avi Schwarzschild*, Zachary C. Lipton, J. Zico Kolter

Benchmarking machine unlearning methods for large language models.

ICML 2023

Can Neural Network Memorization Be Localized?

Pratyush Maini, Michael C. Mozer, Hanie Sedghi, Zachary C. Lipton, J. Zico Kolter, Chiyuan Zhang

Memorization is not confined to individual layers; it is localized to a small set of neurons distributed across the network.

ICLR 2021 Spotlight

Dataset Inference: Ownership Resolution in Machine Learning

Pratyush Maini, Mohammad Yaghini, Nicolas Papernot

First work on dataset inference: determining if a dataset was used for training.