Skip to content

Commit

Permalink
Merge pull request #6 from allenai/soldni/cli
Browse files Browse the repository at this point in the history
CLI for dolma
  • Loading branch information
soldni authored Jul 9, 2023
2 parents d9a341b + b38f7c8 commit f8b8e22
Show file tree
Hide file tree
Showing 60 changed files with 2,210 additions and 702 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/CI.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
target: [x86_64, x86, aarch64, armv7, s390x]
target: [x86_64, x86, aarch64, armv7]
steps:
- uses: actions/checkout@v3
- uses: actions/setup-python@v4
Expand Down
4 changes: 4 additions & 0 deletions CITATION.cff
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,10 @@ authors:
family-names: Groeneveld
email: [email protected]
affiliation: Allen Institute for AI
- given-names: Dustin
family-names: Schwenk
email: [email protected]
affiliation: Allen Institute for AI
- given-names: Ian
family-names: Magnusson
email: [email protected]
Expand Down
30 changes: 30 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 3 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,16 @@ crate-type = ["cdylib"]

[dependencies]
ahash = { version = "0.8.1", features = ["runtime-rng"] }
aws-config = { version = "0.55.0", features = [], default-features = false }
aws-config = { version = "0.55.0"}
aws-sdk-s3 = "0.25.0"
byteorder = "1"
clap = { version = "4.1.11", features = ["derive"] }
env_logger = "0.10.0"
flate2 = { version = "1.0", features = ["zlib-ng"], default-features = false }
jsonpath-rust = "0.3.0"
log = "0.4.17"
pyo3 = "0.19.0"
regex = "1.8.4"
pyo3 = { version = "0.19.0", features = ["extension-module"] }
rand = "0.8.4"
rayon = "1.7.0"
serde = {version = "1.0.160", features = ["derive"]}
Expand Down
57 changes: 57 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
# Detect the host OS once, at parse time, to pick the matching package manager.
UNAME := $(shell uname)

# Each *_SETUP variable holds one shell command that installs a build
# dependency only when it is not already on PATH. The values are intentionally
# NOT wrapped in quotes: they are spliced verbatim into recipe lines of the
# `setup` target, and embedded quotes would become part of the command text.
ifeq ($(UNAME), Darwin)
OS_MESSAGE := MacOS detected
CMAKE_SETUP := which cmake || brew install cmake
PROTOBUF_SETUP := which protoc || brew install protobuf
OPENSSL_SETUP := which openssl || brew install openssl
else ifeq ($(UNAME), Linux)
OS_MESSAGE := Linux detected
CMAKE_SETUP := which cmake || sudo apt-get install --yes build-essential cmake
PROTOBUF_SETUP := which protoc || sudo apt-get install --yes protobuf-compiler
OPENSSL_SETUP := which openssl || sudo apt-get install --yes libssl-dev
else
# Unknown platform: leave the install commands empty so the corresponding
# recipe lines expand to nothing and are skipped by make.
OS_MESSAGE := Unsupported OS; please install rust, cmake, protobuf, and openssl manually
CMAKE_SETUP :=
PROTOBUF_SETUP :=
OPENSSL_SETUP :=
endif

# `setup` is a command, not a file; always run it when requested.
.PHONY: setup

# Install the native toolchain (cmake, protobuf, openssl), Rust, and maturin.
# The install commands run directly as recipe lines so that their output is
# shown and a failing install aborts the target. (Wrapping them in $(shell ...)
# would instead execute the *output* of each command as a new command.)
setup:
	@echo "$(OS_MESSAGE): installing..."
	$(CMAKE_SETUP)
	$(PROTOBUF_SETUP)
	$(OPENSSL_SETUP)
	which cargo || curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
	which maturin || pip install maturin

# S3 prefix under which the mixer test fixtures are uploaded by `setup-test`
# and removed by `clean-test`. Defined once so both targets stay in sync.
TEST_S3_PREFIX := s3://ai2-llm/pretraining-data/tests/mixer

# None of these targets produce a file named after themselves; declare them
# phony so a stray file or directory with the same name cannot mask them.
.PHONY: release test test-python test-rust clean-test setup-test develop style

# Build the wheel with maturin.
release:
	maturin build

# Full test cycle. The prerequisites must run in the listed order (fixtures
# are uploaded before the tests and removed afterwards), so forbid running
# them in parallel under `make -j`. GNU Make >= 4.4 serializes only the
# prerequisites of `test`; older versions serialize the whole invocation.
.NOTPARALLEL: test
test: setup develop setup-test test-python test-rust clean-test

# Run the Python test suite only.
test-python:
	pytest -vs tests/python

# Run the Rust test suite only, showing test output.
test-rust:
	cargo test -- --nocapture

# Remove local scratch output and the fixtures uploaded by `setup-test`.
clean-test:
	rm -rf tests/work/*
	aws s3 rm --recursive $(TEST_S3_PREFIX)/

# Upload the local test fixtures to S3 (requires configured AWS credentials).
setup-test:
	aws s3 cp tests/data/documents.json.gz $(TEST_S3_PREFIX)/inputs/v0/documents/head/0000.json.gz
	aws s3 cp tests/data/pii-attributes.json.gz $(TEST_S3_PREFIX)/inputs/v0/attributes/pii/head/0000.json.gz
	aws s3 cp tests/data/toxicity-attributes.json.gz $(TEST_S3_PREFIX)/inputs/v0/attributes/toxicity/head/0000.json.gz
	aws s3 cp tests/data/sample-attributes.json.gz $(TEST_S3_PREFIX)/inputs/v0/attributes/sample/head/0000.json.gz
	aws s3 cp tests/data/duplicate-paragraphs.json.gz $(TEST_S3_PREFIX)/inputs/v0/attributes/duplicate_paragraphs/head/0000.json.gz
	aws s3 sync tests/data/expected $(TEST_S3_PREFIX)/expected --exclude ".*" --exclude "*/.*"

# Install the package into the active environment with the `dev` extras.
develop:
	maturin develop --extras=dev

# Format the Rust sources and the Python package and tests in place.
style:
	rustfmt --edition 2021 src/*.rs
	autopep8 --in-place --recursive python/ && isort python/ && black python/
	autopep8 --in-place --recursive tests/python/ && isort tests/python/ && black tests/python/
50 changes: 18 additions & 32 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,62 +3,48 @@
*Data to feed OLMo's Appetite*


<img alt="DOLMa logo. It's a watercolor of grape leaves with the word DOLMa in the top left." src="res/logo.png" width="256"></img>
<img alt="DOLMa logo. It's a watercolor of grape leaves with the word DOLMa in the top left." src="https://github.com/allenai/dolma/blob/main/res/logo.png?raw=true" width="256"></img>

Data and tools for generating and inspecting OLMo pre-training data.


## Setup

Install Rust
```
curl https://sh.rustup.rs -sSf | sh
```

Install [CMake](https://cmake.org/install/)

* On **Mac OSX** with `brew install cmake`
* On **Linux** with `apt-get install cmake`


Install [OpenSSL](https://www.openssl.org/)

* On **Mac OSX** with `brew install openssl re2`
* On **Linux** with `apt-get install openssl`

Install [Protobuf]()
Create a conda environment with Python >= 3.8. In this case, we use Python 3.10 and use Anaconda to create the environment.

* On **Mac OSX** with `brew install protobuf`
* On **Linux** with `apt-get install protobuf-compiler`

Setting up Python
```
```shell
conda create -n dolma python=3.10
```

After creating the environment, activate it and install necessary tools using the included makefile.

Install [Maturin](https://www.maturin.rs/)

```
pip install maturin
maturin develop
```shell
conda activate dolma
make setup
```

Finally, to begin development, install the repository in editable mode using maturin.

Installing this repository
```shell
make develop
```
cd dolma
pip install -e .

To run tests, use the following command.

```shell
make test
```

You can choose to run just the Python or Rust tests by calling `make test-python` or `make test-rust` respectively.


## Citation

If you use this repository, please cite it as:

```bibtex
@software{dolma,
author = {{Soldaini, Luca and Lo, Kyle and Kinney, Rodney and Naik, Aakanksha and Ravichander, Abhilasha and Bhagia, Akshita and Groeneveld, Dirk and Magnusson, Ian and Chandu, Khyathi}},
author = {{Soldaini, Luca and Lo, Kyle and Kinney, Rodney and Naik, Aakanksha and Ravichander, Abhilasha and Bhagia, Akshita and Groeneveld, Dirk and Schwenk, Dustin and Magnusson, Ian and Chandu, Khyathi}},
license = {{Apache-2.0}},
title = {{DOLMa}},
url = {https://github.com/allenai/dolma}
Expand Down
14 changes: 10 additions & 4 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@ dependencies = [
"boto3",
"cached-path==1.3.4",
"msgspec>=0.14.2",
"smashed[remote]>=0.20.0",
"presidio_analyzer==2.2.32",
"pycld2==0.41",
# "pycld3==0.22",
Expand All @@ -23,12 +22,14 @@ dependencies = [
"pyyaml",
"blingfire==0.1.8",
"detect-secrets==1.4.0",
"termcolor==2.3.0",
"rich>=10.12.0",
"smart-open>=6.3.0",
"nltk==3.8.1"
"nltk==3.8.1",
"fsspec>=2021.10.0",
"s3fs>=2021.10.0",
]
classifiers = [
"Development Status :: 3 - Veta",
"Development Status :: 3 - Alpha",
"Typing :: Typed",
"Programming Language :: Rust",
"Programming Language :: Python :: Implementation :: CPython",
Expand Down Expand Up @@ -67,6 +68,10 @@ email = "[email protected]"
name = "Dirk Groeneveld"
email = "[email protected]"

[[project.authors]]
name = "Dustin Schwenk"
email = "[email protected]"

[[project.authors]]
name = "Ian Magnusson"
email = "[email protected]"
Expand Down Expand Up @@ -98,6 +103,7 @@ dev = [
"ipdb>=0.13.0",
"flake8-pyi>=22.8.1",
"Flake8-pyproject>=1.1.0",
"awscli>=1.16.0",
]
[build-system]
requires = [
Expand Down
11 changes: 10 additions & 1 deletion python/dolma/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,15 @@
import json
import warnings

from . import dolma as _dolma # type: ignore
# warning raised by pkg_resources used in a lot of google packages
warnings.filterwarnings("ignore", message=r".*declare_namespace\(\'.*google.*", category=DeprecationWarning)
# base warning raised whenever the warnings above are raised
warnings.filterwarnings("ignore", message=r".*pkg_resources is deprecated.*", category=DeprecationWarning)

# must import taggers to register them
# we import the rust extension here and wrap it in a python module
from . import dolma as _dolma # type: ignore # noqa: E402
from . import taggers # noqa: E402


def deduper(config: dict):
Expand Down
Loading

0 comments on commit f8b8e22

Please sign in to comment.