Getting Started

This guide will help you get up and running with Sipha. We’ll cover installation, a quick start example, and an overview of the core concepts.

Installation

Add Sipha to your Cargo.toml:

[dependencies]
sipha = "0.5.0"

Or with specific features:

[dependencies]
sipha = { version = "0.5.0", features = ["diagnostics", "unicode", "backend-ll"] }

Available Features

backend-ll: Enable LL(k) parser backend (default)
backend-lr: Enable LR parser backend
backend-glr: Enable GLR parser backend (requires backend-lr)
diagnostics: Enable rich error diagnostics with miette
unicode: Enable full Unicode support for identifiers
visitor: Enable syntax tree visitor patterns
query: Enable XPath-like query API for syntax trees
tree-utils: Enable tree diffing and validation utilities

Quick Start

Let’s build a simple arithmetic expression parser step by step. This example will help you understand the core concepts.

Step 1: Define Your Syntax Kinds

First, define the tokens and non-terminals your parser will use. Sipha uses a unified SyntaxKind trait for both terminals (tokens) and non-terminals (grammar rules):

use sipha::syntax::SyntaxKind;

/// Every kind of syntax element in the arithmetic example.
///
/// Terminals and non-terminals share this single enum; `is_terminal` in the
/// `SyntaxKind` impl below tells them apart.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
enum ArithSyntaxKind {
    // Terminals (produced by the lexer)
    /// A run of one or more digits (see the lexer in Step 2).
    Number,
    /// The `+` literal.
    Plus,
    /// The `-` literal.
    Minus,
    /// The `*` literal.
    Multiply,
    /// The `/` literal.
    Divide,
    /// The `(` literal.
    LParen,
    /// The `)` literal.
    RParen,
    /// A run of one or more whitespace characters; reported as trivia by `is_trivia`.
    Whitespace,
    /// Passed as the first argument to `LexerBuilder::build` in Step 2;
    /// presumably the end-of-input marker (confirm against the API docs).
    Eof,
    // Non-terminals (produced by the parser)
    Expr,
    Term,
    Factor,
}

impl SyntaxKind for ArithSyntaxKind {
    /// Returns `true` for every token kind and `false` for the grammar-rule
    /// kinds `Expr`, `Term` and `Factor`.
    fn is_terminal(self) -> bool {
        // Listing every variant keeps the match exhaustive, so adding a new
        // kind forces a decision here at compile time.
        match self {
            Self::Number
            | Self::Plus
            | Self::Minus
            | Self::Multiply
            | Self::Divide
            | Self::LParen
            | Self::RParen
            | Self::Whitespace
            | Self::Eof => true,
            Self::Expr | Self::Term | Self::Factor => false,
        }
    }

    /// Returns `true` only for `Whitespace`.
    fn is_trivia(self) -> bool {
        self == Self::Whitespace
    }
}

Step 2: Build a Lexer

Create a lexer to tokenize your input text:

use sipha::syntax::SyntaxKind;
// Same syntax kinds as in Step 1, repeated so this snippet is self-contained.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
enum ArithSyntaxKind {
    // Terminals
    Number, Plus, Minus, Multiply, Divide, LParen, RParen, Whitespace, Eof,
    // Non-terminals
    Expr, Term, Factor,
}
impl SyntaxKind for ArithSyntaxKind {
    fn is_terminal(self) -> bool {
        match self {
            Self::Number | Self::Plus | Self::Minus | Self::Multiply | Self::Divide
            | Self::LParen | Self::RParen | Self::Whitespace | Self::Eof => true,
            Self::Expr | Self::Term | Self::Factor => false,
        }
    }
    fn is_trivia(self) -> bool {
        self == Self::Whitespace
    }
}
use sipha::lexer::{LexerBuilder, Pattern, CharSet};

// One or more characters from the digit character set.
let digits = Pattern::Repeat {
    pattern: Box::new(Pattern::CharClass(CharSet::digits())),
    min: 1,
    max: None,
};
// One or more characters from the whitespace character set.
let spaces = Pattern::Repeat {
    pattern: Box::new(Pattern::CharClass(CharSet::whitespace())),
    min: 1,
    max: None,
};

let lexer = LexerBuilder::new()
    .token(ArithSyntaxKind::Number, digits)
    // Operators and parentheses are matched as fixed literals.
    .token(ArithSyntaxKind::Plus, Pattern::Literal("+".into()))
    .token(ArithSyntaxKind::Minus, Pattern::Literal("-".into()))
    .token(ArithSyntaxKind::Multiply, Pattern::Literal("*".into()))
    .token(ArithSyntaxKind::Divide, Pattern::Literal("/".into()))
    .token(ArithSyntaxKind::LParen, Pattern::Literal("(".into()))
    .token(ArithSyntaxKind::RParen, Pattern::Literal(")".into()))
    .token(ArithSyntaxKind::Whitespace, spaces)
    // Register whitespace as trivia.
    .trivia(ArithSyntaxKind::Whitespace)
    .build(ArithSyntaxKind::Eof, ArithSyntaxKind::Number)
    .expect("Failed to build lexer");

Step 3: Tokenize Input

Run the lexer from Step 2 over an input string to produce the tokens the parser will consume:

use sipha::lexer::{LexerBuilder, Pattern, CharSet};
use sipha::syntax::SyntaxKind;
// Same syntax kinds as in Step 1, repeated so this snippet is self-contained.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
enum ArithSyntaxKind {
    // Terminals
    Number, Plus, Minus, Multiply, Divide, LParen, RParen, Whitespace, Eof,
    // Non-terminals
    Expr, Term, Factor,
}
impl SyntaxKind for ArithSyntaxKind {
    fn is_terminal(self) -> bool {
        match self {
            Self::Number | Self::Plus | Self::Minus | Self::Multiply | Self::Divide
            | Self::LParen | Self::RParen | Self::Whitespace | Self::Eof => true,
            Self::Expr | Self::Term | Self::Factor => false,
        }
    }
    fn is_trivia(self) -> bool {
        self == Self::Whitespace
    }
}
// Same lexer as in Step 2.
let digits = Pattern::Repeat {
    pattern: Box::new(Pattern::CharClass(CharSet::digits())),
    min: 1,
    max: None,
};
let spaces = Pattern::Repeat {
    pattern: Box::new(Pattern::CharClass(CharSet::whitespace())),
    min: 1,
    max: None,
};
let lexer = LexerBuilder::new()
    .token(ArithSyntaxKind::Number, digits)
    .token(ArithSyntaxKind::Plus, Pattern::Literal("+".into()))
    .token(ArithSyntaxKind::Minus, Pattern::Literal("-".into()))
    .token(ArithSyntaxKind::Multiply, Pattern::Literal("*".into()))
    .token(ArithSyntaxKind::Divide, Pattern::Literal("/".into()))
    .token(ArithSyntaxKind::LParen, Pattern::Literal("(".into()))
    .token(ArithSyntaxKind::RParen, Pattern::Literal(")".into()))
    .token(ArithSyntaxKind::Whitespace, spaces)
    .trivia(ArithSyntaxKind::Whitespace)
    .build(ArithSyntaxKind::Eof, ArithSyntaxKind::Number)
    .expect("Failed to build lexer");

// Turn the source text into tokens.
let input = "42 + 10";
let tokens = lexer.tokenize(input).expect("Failed to tokenize input");

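Step 4: Define Non-Terminals and Build Grammar

Declare a type for your grammar's non-terminals, then register the grammar rules with GrammarBuilder. To keep the quick start short, the grammar below has a single rule, Expr -> Number: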

use sipha::syntax::SyntaxKind;
// Same syntax kinds as in Step 1, repeated so this snippet is self-contained.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
enum ArithSyntaxKind {
    // Terminals
    Number, Plus, Minus, Multiply, Divide, LParen, RParen, Whitespace, Eof,
    // Non-terminals
    Expr, Term, Factor,
}
impl SyntaxKind for ArithSyntaxKind {
    fn is_terminal(self) -> bool {
        match self {
            Self::Number | Self::Plus | Self::Minus | Self::Multiply | Self::Divide
            | Self::LParen | Self::RParen | Self::Whitespace | Self::Eof => true,
            Self::Expr | Self::Term | Self::Factor => false,
        }
    }
    fn is_trivia(self) -> bool {
        self == Self::Whitespace
    }
}
use sipha::grammar::{GrammarBuilder, NonTerminal, Expr};
use sipha::lexer::Token as LexerToken;
use sipha::syntax::{TextRange, TextSize};
use std::convert::TryFrom;

/// Non-terminals of the arithmetic grammar.
///
/// Only `Expr` is referenced by the single-rule grammar built in this step;
/// `Term` and `Factor` are declared but not yet used here.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
enum ArithNonTerminal {
    /// Entry point of the grammar.
    Expr,
    Term,
    Factor,
}

impl NonTerminal for ArithNonTerminal {
    /// Returns the non-terminal's name, identical to its variant name.
    fn name(&self) -> &str {
        match *self {
            Self::Expr => "Expr",
            Self::Term => "Term",
            Self::Factor => "Factor",
        }
    }
}

// Helper that builds a token of `kind` for `text`, with a range that starts
// at `offset` and is as long as `text`.
fn create_token(kind: ArithSyntaxKind, text: &str, offset: u32) -> LexerToken<ArithSyntaxKind> {
    // A length that does not fit in a `u32` falls back to 0.
    let text_len: u32 = u32::try_from(text.len()).unwrap_or(0);
    let start = TextSize::from(offset);
    let range = TextRange::at(start, TextSize::from(text_len));
    LexerToken::new(kind, text, range)
}

// Build your grammar rules
// Simple grammar: Expr -> Number
let number_token = create_token(ArithSyntaxKind::Number, "42", 0);
let grammar = GrammarBuilder::new()
    .entry_point(ArithNonTerminal::Expr)
    .rule(ArithNonTerminal::Expr, Expr::token(number_token))
    .build()
    .expect("Failed to build grammar");

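Step 5: Parse!

Finally, create an LL parser from the grammar and run it over the tokens, starting at the Expr non-terminal: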

use sipha::syntax::SyntaxKind;
// Same syntax kinds as in Step 1, repeated so this snippet is self-contained.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
enum ArithSyntaxKind {
    // Terminals
    Number, Plus, Minus, Multiply, Divide, LParen, RParen, Whitespace, Eof,
    // Non-terminals
    Expr, Term, Factor,
}
impl SyntaxKind for ArithSyntaxKind {
    fn is_terminal(self) -> bool {
        match self {
            Self::Number | Self::Plus | Self::Minus | Self::Multiply | Self::Divide
            | Self::LParen | Self::RParen | Self::Whitespace | Self::Eof => true,
            Self::Expr | Self::Term | Self::Factor => false,
        }
    }
    fn is_trivia(self) -> bool {
        self == Self::Whitespace
    }
}
use sipha::grammar::{GrammarBuilder, NonTerminal, Expr};
use sipha::lexer::Token as LexerToken;
use sipha::syntax::{TextRange, TextSize};
use std::convert::TryFrom;
// Same non-terminals as in Step 4.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
enum ArithNonTerminal {
    Expr,
    Term,
    Factor,
}
impl NonTerminal for ArithNonTerminal {
    fn name(&self) -> &str {
        match *self {
            Self::Expr => "Expr",
            Self::Term => "Term",
            Self::Factor => "Factor",
        }
    }
}
// Same helper as in Step 4: the token's range starts at `offset` and is as
// long as `text`.
fn create_token(kind: ArithSyntaxKind, text: &str, offset: u32) -> LexerToken<ArithSyntaxKind> {
    // A length that does not fit in a `u32` falls back to 0.
    let text_len: u32 = u32::try_from(text.len()).unwrap_or(0);
    let start = TextSize::from(offset);
    let range = TextRange::at(start, TextSize::from(text_len));
    LexerToken::new(kind, text, range)
}
// Same single-rule grammar as in Step 4: Expr -> Number
let number_token = create_token(ArithSyntaxKind::Number, "42", 0);
let grammar = GrammarBuilder::new()
    .entry_point(ArithNonTerminal::Expr)
    .rule(ArithNonTerminal::Expr, Expr::token(number_token))
    .build()
    .expect("Failed to build grammar");
use sipha::lexer::{LexerBuilder, Pattern, CharSet};
// Same lexer and input as in Steps 2 and 3.
let digits = Pattern::Repeat {
    pattern: Box::new(Pattern::CharClass(CharSet::digits())),
    min: 1,
    max: None,
};
let spaces = Pattern::Repeat {
    pattern: Box::new(Pattern::CharClass(CharSet::whitespace())),
    min: 1,
    max: None,
};
let lexer = LexerBuilder::new()
    .token(ArithSyntaxKind::Number, digits)
    .token(ArithSyntaxKind::Plus, Pattern::Literal("+".into()))
    .token(ArithSyntaxKind::Minus, Pattern::Literal("-".into()))
    .token(ArithSyntaxKind::Multiply, Pattern::Literal("*".into()))
    .token(ArithSyntaxKind::Divide, Pattern::Literal("/".into()))
    .token(ArithSyntaxKind::LParen, Pattern::Literal("(".into()))
    .token(ArithSyntaxKind::RParen, Pattern::Literal(")".into()))
    .token(ArithSyntaxKind::Whitespace, spaces)
    .trivia(ArithSyntaxKind::Whitespace)
    .build(ArithSyntaxKind::Eof, ArithSyntaxKind::Number)
    .expect("Failed to build lexer");
let input = "42 + 10";
let tokens = lexer.tokenize(input).expect("Failed to tokenize input");
use sipha::backend::ll::{LlParser, LlConfig};
use sipha::backend::ParserBackend;

// Create an LL parser for the grammar with the default configuration.
let mut parser = LlParser::new(&grammar, LlConfig::default())
    .expect("Failed to create parser");

// Parse the tokens, starting from the `Expr` non-terminal.
let result = parser.parse(&tokens, ArithNonTerminal::Expr);

For a complete working example, see the Basic Arithmetic Example or check out examples/basic_arithmetic.rs in the repository.

Core Concepts Overview

Before diving deeper, here’s a quick overview of Sipha’s core concepts:

Syntax Kinds

Sipha uses a unified SyntaxKind trait for both terminals (tokens) and non-terminals (grammar rules). This design simplifies the API and allows for flexible grammar definitions.

See Syntax Kinds for more details.