Skip to content

Reference

CODON_TRANSLATION_TABLE = {'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L', 'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S', 'TAT': 'Y', 'TAC': 'Y', 'TAA': 'X', 'TAG': 'X', 'TGT': 'C', 'TGC': 'C', 'TGA': 'X', 'TGG': 'W', 'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L', 'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P', 'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q', 'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R', 'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M', 'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T', 'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K', 'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R', 'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V', 'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A', 'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E', 'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G'} module-attribute

Table to translate between nucleotide triplets and amino acids.

START_CODONS = ['TTG', 'CTG', 'ATG'] module-attribute

List of start codons

STOP_CODONS = ['TAA', 'TAG', 'TGA'] module-attribute

List of stop codons

CDSNode

Bases: Node

Node type describing a coding sequence feature.

Source code in src/geffa/geffa.py
class CDSNode(Node):
    """Node type describing a coding sequence feature."""
    type: str = 'CDS'
    toplevel: bool = False

    def validate(self) -> None:
        """Validate the CDS feature."""
        if self.phase not in [0, 1, 2]:
            raise ValueError('Phase needs to be 0, 1 or 2 for CDS.')

        if len(self.parents) != 1:
            raise ValueError('CDS needs exactly one parent')

        if self.parents[0].type != 'mRNA':
            raise ValueError('CDS parent needs to be mRNA')

    def extend_coordinates(
        self,
        new_start: int | None = None,
        new_end: int | None = None
    ):
        """Extend the start and stop coordinates of the feature to the given
        coordinates.

        Crucially, this only allows extending, a feature cannot be shrunk this
        way (i.e. the new start cannot be to the right of the original).
        It also extends the parent's coordinates.

        Args:
            new_start: New start coordinate (default `None`).
            new_end: New end coordintate (default `None`).
        """
        super().extend_coordinates(new_start, new_end)
        self.parents[0].extend_coordinates(new_start, new_end)

extend_coordinates(new_start=None, new_end=None)

Extend the start and stop coordinates of the feature to the given coordinates.

Crucially, this only allows extending, a feature cannot be shrunk this way (i.e. the new start cannot be to the right of the original). It also extends the parent's coordinates.

Parameters:

Name Type Description Default
new_start int | None

New start coordinate (default None).

None
new_end int | None

New end coordintate (default None).

None
Source code in src/geffa/geffa.py
def extend_coordinates(
    self,
    new_start: int | None = None,
    new_end: int | None = None
):
    """Extend the start and stop coordinates of the feature to the given
    coordinates.

    Crucially, this only allows extending, a feature cannot be shrunk this
    way (i.e. the new start cannot be to the right of the original).
    It also extends the parent's coordinates.

    Args:
        new_start: New start coordinate (default `None`).
        new_end: New end coordintate (default `None`).
    """
    super().extend_coordinates(new_start, new_end)
    self.parents[0].extend_coordinates(new_start, new_end)

validate()

Validate the CDS feature.

Source code in src/geffa/geffa.py
def validate(self) -> None:
    """Validate the CDS feature."""
    if self.phase not in [0, 1, 2]:
        raise ValueError('Phase needs to be 0, 1 or 2 for CDS.')

    if len(self.parents) != 1:
        raise ValueError('CDS needs exactly one parent')

    if self.parents[0].type != 'mRNA':
        raise ValueError('CDS parent needs to be mRNA')

CDSOverlap

Bases: Issue

Invoked when two CDSs are overlapping. No automatic fix available.

Source code in src/geffa/geffa.py
class CDSOverlap(Issue):
    """Invoked when two CDSs are overlapping. No automatic fix available."""
    def __init__(self, node, other):
        super().__init__(node)
        self.other = other

    def __message__(self):
        return f'CDS overlaps another CDS {self.other.attributes["ID"]}.'

CDSPhaseIncorrectIssue

Bases: Issue

Invoked when the recorded CDS phase differs from the calculated phase.

Source code in src/geffa/geffa.py
class CDSPhaseIncorrectIssue(Issue):
    """Invoked when the recorded CDS phase differs from the calculated phase.
    """
    def __init__(self, node, correct_phase) -> None:
        super().__init__(node)
        self.correct_phase = correct_phase

    def fix(self) -> None:
        logger.debug('')
        self.node.phase = self.correct_phase
        self._debug_log_msg('Corrected phase of CDS feature.')
        raise RevalidationNecessary()

    def __message__(self) -> str:
        if 'pseudogene' in self.node.parents[0].parents[0].attributes:
            can_be_ignored: str = (
                ' (can be ignored - parent gene is pseudogene)')
        else:
            can_be_ignored = ''
        return (
            f'CDS phase ({self.node.phase}) should be '
            f'{self.correct_phase}{can_be_ignored}.'
        )

CodingSequenceContainsInternalStopCodons

Bases: Issue

Invoked when a CDS contains internal stop codons.

When fixing, the algorithm truncates the CDS to the first internal stop codon, unless the CDS length would shrink by more than 10%. In that case, the gene is marked as a pseudogene.

Source code in src/geffa/geffa.py
class CodingSequenceContainsInternalStopCodons(Issue):
    """Invoked when a CDS contains internal stop codons.

    When fixing, the algorithm truncates the CDS to the first internal stop
    codon, unless the CDS length would shrink by more than 10%.
    In that case, the gene is marked as a pseudogene."""
    def fix(self):
        coding_sequence = self.node.coding_sequence()
        stop_codon_presence = [
            str(coding_sequence[i:i+3]) in STOP_CODONS
            for i in range(0, len(coding_sequence)-3, 3)]
        if not any(stop_codon_presence):
            raise RuntimeError(
                'Coding sequence is supposed to have an internal stop, but '
                'cannot find it.')
        first_stop_index = stop_codon_presence.index(True)

        old_length = len(coding_sequence)
        new_length = first_stop_index*3+3
        if new_length/old_length < 0.9:
            self._debug_log_msg(
                'Marking as pseudo since length would be truncated more than '
                '10% by first internal stop codon.')
            self.node.parents[0].mark_as_pseudo()
            raise RevalidationNecessary()

        CDSs = self.node.CDS_children()
        if len(CDSs) > 1:
            self._debug_log_msg(
                'Cannot truncate coding sequence with multiple CDSs (needs to '
                'be implemented).')
            raise NotImplementedError
        CDS = CDSs[-1]
        self._debug_log_msg('Truncating coding sequence to first stop codon.')
        if self.node.strand == '+':
            CDS.end = CDS.start + 3*first_stop_index + 2
            raise RevalidationNecessary()
        else:
            CDS.start = CDS.end - 3*first_stop_index + 1
            raise RevalidationNecessary()

    def __message__(self):
        return 'Coding sequence contains internal stop codons.'

CodingSequenceEndIsNotStopCodon

Bases: Issue

Invoked when a CDS does not end with a stop codon.

When fixing, the algorithm tries to find a stop codon within 30nt of the end of the CDS. If that cannot be found, the gene will be marked as a pseudogene, unless the CDS contains internal stop codons.

Source code in src/geffa/geffa.py
class CodingSequenceEndIsNotStopCodon(Issue):
    """Invoked when a CDS does not end with a stop codon.

    When fixing, the algorithm tries to find a stop codon within 30nt of the
    end of the CDS. If that cannot be found, the gene will be marked as a
    pseudogene, unless the CDS contains internal stop codons."""
    def __message__(self) -> str:
        return 'Coding sequence does not end with a stop codon.'

    def fix(self) -> None:
        mRNA = self.node
        contig_seq: Seq = mRNA.sequence_region.sequence

        CDSs = mRNA.CDS_children()
        last_CDS = CDSs[-1]
        length = last_CDS.end - last_CDS.start + 1
        codon_aligned_length = 3*(length//3) + last_CDS.phase
        if mRNA.strand == '+':
            end = last_CDS.start + codon_aligned_length - 1
            adjacent_seq = str(contig_seq[end:min(len(contig_seq), end+30*3)])
        else:
            start = last_CDS.end - codon_aligned_length + 1
            adjacent_seq = str(
                contig_seq[max(0, start-30*3-1):start-1].reverse_complement())
        adjacent_codons = [
            adjacent_seq[i:i+3] for i in range(0, len(adjacent_seq), 3)]
        for stop_codon in STOP_CODONS:
            try:
                index = adjacent_codons.index(stop_codon)
            except ValueError:
                continue
            # Found a stop codon within 30 codons of the end - adjusting end.
            if mRNA.strand == '+':
                last_CDS.extend_coordinates(
                    new_end=last_CDS.end + 3*index+3)
            else:
                last_CDS.extend_coordinates(
                    new_start=last_CDS.start - 3*index-3)
            self._debug_log_msg(
                'Extended coding sequence to include next stop codon.')
            raise RevalidationNecessary()
        else:
            # Check if we have an issue with internal stop codons first, if
            # not mark as pseudo
            if not any([
                issubclass(
                    type(issue), CodingSequenceContainsInternalStopCodons)
                for issue in self.node.issues
            ]):
                self._debug_log_msg(
                    'No stop codon found, marking gene as pseudo.')
                mRNA.parents[0].mark_as_pseudo()
            else:
                self._debug_log_msg(
                    'No stop codon found, but holding off on marking gene as '
                    'pseudo because there are internal stop codons present.'
                )

CodingSequenceStartIsNotStartCodon

Bases: Issue

Invoked when a CDS does not start with a start codon.

The fix is to mark the parent gene as a pseudogene.

Source code in src/geffa/geffa.py
class CodingSequenceStartIsNotStartCodon(Issue):
    """Invoked when a CDS does not start with a start codon.

    The fix is to mark the parent gene as a pseudogene."""
    def __message__(self) -> str:
        return 'Coding sequence does not start with a start codon.'

    def fix(self):
        self.node.parents[0].mark_as_pseudo()
        self._debug_log_msg('Marked gene as a pseudogene.')
        raise RevalidationNecessary()

ExonDoesNotContainCDSCompletely

Bases: Issue

Invoked when a CDS extends beyond its containing exon.

When fixing, the exon's coordinates are extended to contain the CDS fully.

Source code in src/geffa/geffa.py
class ExonDoesNotContainCDSCompletely(Issue):
    """Invoked when a CDS extends beyond its containing exon.

    When fixing, the exon's coordinates are extended to contain the CDS fully.
    """
    def __init__(self, node, CDS):
        super().__init__(node)
        self.CDS = CDS

    def fix(self):
        exon = self.node
        CDS = self.CDS

        new_start = min(exon.start, CDS.start)
        new_end = max(exon.end, CDS.end)
        exon.extend_coordinates(new_start=new_start, new_end=new_end)
        self._debug_log_msg(
            'Extended exon coordinates to contain CDS '
            f'{CDS.attributes["ID"]}.')
        raise RevalidationNecessary()

    def __message__(self):
        return 'CDS extends beyond the range of the exon.'

ExonNode

Bases: Node

Node type describing an exon.

Source code in src/geffa/geffa.py
class ExonNode(Node):
    """Node type describing an exon."""
    type: str = 'exon'
    toplevel: bool = False

    def validate(self) -> None:
        """Validate the exon feature."""
        if self.phase != '.':
            raise ValueError('Phase needs to be "." for exon.')

        for p in self.parents:
            if p.type not in ['mRNA', 'ncRNA', 'rRNA', 'tRNA']:
                raise ValueError('Exon parent needs to be an RNA')

    def extend_coordinates(
        self,
        new_start: int | None = None,
        new_end: int | None = None
    ):
        """Extend the start and stop coordinates of the feature to the given
        coordinates.

        Crucially, this only allows extending, a feature cannot be shrunk this
        way (i.e. the new start cannot be to the right of the original).
        It also extends the parent's coordinates.

        Args:
            new_start: New start coordinate (default `None`).
            new_end: New end coordintate (default `None`).
        """
        super().extend_coordinates(new_start, new_end)
        self.parents[0].extend_coordinates(new_start, new_end)

extend_coordinates(new_start=None, new_end=None)

Extend the start and stop coordinates of the feature to the given coordinates.

Crucially, this only allows extending, a feature cannot be shrunk this way (i.e. the new start cannot be to the right of the original). It also extends the parent's coordinates.

Parameters:

Name Type Description Default
new_start int | None

New start coordinate (default None).

None
new_end int | None

New end coordintate (default None).

None
Source code in src/geffa/geffa.py
def extend_coordinates(
    self,
    new_start: int | None = None,
    new_end: int | None = None
):
    """Extend the start and stop coordinates of the feature to the given
    coordinates.

    Crucially, this only allows extending, a feature cannot be shrunk this
    way (i.e. the new start cannot be to the right of the original).
    It also extends the parent's coordinates.

    Args:
        new_start: New start coordinate (default `None`).
        new_end: New end coordintate (default `None`).
    """
    super().extend_coordinates(new_start, new_end)
    self.parents[0].extend_coordinates(new_start, new_end)

validate()

Validate the exon feature.

Source code in src/geffa/geffa.py
def validate(self) -> None:
    """Validate the exon feature."""
    if self.phase != '.':
        raise ValueError('Phase needs to be "." for exon.')

    for p in self.parents:
        if p.type not in ['mRNA', 'ncRNA', 'rRNA', 'tRNA']:
            raise ValueError('Exon parent needs to be an RNA')

FeatureEndRightOfParentIssue

Bases: Issue

Invoked when a gene or gene feature has an end index beyong the end of its parent.

Source code in src/geffa/geffa.py
class FeatureEndRightOfParentIssue(Issue):
    """Invoked when a gene or gene feature has an end index beyong the end of
    its parent."""
    def __init__(self, node, parent) -> None:
        super().__init__(node)
        self.parent = parent

    def fix(self) -> None:
        self.parent.end = self.node.end
        self._debug_log_msg('Extended end of parent feature.')
        raise RevalidationNecessary()

    def __message__(self) -> str:
        return (
            f'Feature end ({self.node.end}) is to the right of one of its '
            f'parents ({self.parent.end}).'
        )

FeatureEndRightOfSequenceRegionEnd

Bases: Issue

Invoked when a gene or gene feature has an end index beyond the end of the sequence region it belongs to.

Source code in src/geffa/geffa.py
class FeatureEndRightOfSequenceRegionEnd(Issue):
    """Invoked when a gene or gene feature has an end index beyond the end of
    the sequence region it belongs to."""
    def fix(self) -> None:
        self.node.end = self.node.sequence_region.end
        logger.debug('Truncating gene to the end of the sequence region.')
        raise RevalidationNecessary()

    def __message__(self) -> str:
        return (
            f'Feature end ({self.node.end}) is to the right of the end of '
            f'the containing sequence region ({self.node.sequence_region.name}'
            f': {self.node.sequence_region.end}'
        )

FeatureStartLeftOfParentIssue

Bases: Issue

Invoked when a gene or gene feature has a start index smaller than the start of its parent.

Source code in src/geffa/geffa.py
class FeatureStartLeftOfParentIssue(Issue):
    """Invoked when a gene or gene feature has a start index smaller than the
    start of its parent."""
    def __init__(self, node, parent) -> None:
        super().__init__(node)
        self.parent = parent

    def fix(self) -> None:
        self.parent.start = self.node.start
        self._debug_log_msg('Extended start of parent feature.')
        raise RevalidationNecessary()

    def __message__(self) -> str:
        return (
            f'Feature start ({self.node.start}) is to the left of one of its '
            f'parents ({self.parent.start}).'
        )

FeatureStartLeftOfSequenceRegionStart

Bases: Issue

Invoked when a gene or gene feature has a start index smaller than the start of the sequence region it belongs to.

Source code in src/geffa/geffa.py
class FeatureStartLeftOfSequenceRegionStart(Issue):
    """Invoked when a gene or gene feature has a start index smaller than the
    start of the sequence region it belongs to."""
    def fix(self) -> None:
        self.node.start = self.node.sequence_region.start
        logger.debug('Truncating gene to the start of the sequence region.')
        raise RevalidationNecessary()

    def __message__(self) -> str:
        return (
            f'Feature start ({self.node.start}) is to the left of the start '
            'of the containing sequence region '
            f'({self.node.sequence_region.name}: '
            f'{self.node.sequence_region.start}'
        )

FivePrimeUTRNode

Bases: Node

Node type describing a 5' untranslated region (UTR).

Source code in src/geffa/geffa.py
class FivePrimeUTRNode(Node):
    """Node type describing a 5' untranslated region (UTR)."""
    type: str = 'five_prime_UTR'
    toplevel: bool = False

    def validate(self) -> None:
        """Validate the 5'UTR feature."""
        if self.phase != '.':
            raise ValueError('Phase needs to be "." for five_prime_UTR.')

        for p in self.parents:
            if p.type != 'mRNA':
                raise ValueError('five_prime_UTR parent needs to be mRNA')

validate()

Validate the 5'UTR feature.

Source code in src/geffa/geffa.py
def validate(self) -> None:
    """Validate the 5'UTR feature."""
    if self.phase != '.':
        raise ValueError('Phase needs to be "." for five_prime_UTR.')

    for p in self.parents:
        if p.type != 'mRNA':
            raise ValueError('five_prime_UTR parent needs to be mRNA')

GeneNode

Bases: Node

Node type describing a gene feature.

Source code in src/geffa/geffa.py
class GeneNode(Node):
    """Node type describing a gene feature."""
    type: str = 'gene'
    toplevel: bool = True

    def validate(self) -> None:
        """Validate the gene node."""
        if self.type != 'gene':
            raise ValueError(f'GeneNode called with wrong type "{self.type}"')
        if self.phase != '.':
            raise ValueError('Phase needs to be "." for genes.')
        if 'Parent' in self.attributes:
            raise ValueError(
                '"Parent" attribute does not make sense for gene.')

    def mark_as_pseudo(self, reason: str = 'unknown') -> None:
        """This marks the gene feature as a pseudo gene.

        It will delete any CDS, 3'UTR and 5'UTR features found in any mRNA
        child features.

        Args:
            reason: Text describing the reason for marking as a pseudogene
                (default "unknown").
        """
        self.attributes['pseudogene'] = reason
        for child in self.children:
            if child.type == 'mRNA':
                for grandchild in list(child.children):
                    if grandchild.type in [
                        'CDS', 'three_prime_UTR', 'five_prime_UTR'
                    ]:
                        grandchild.delete()

mark_as_pseudo(reason='unknown')

This marks the gene feature as a pseudo gene.

It will delete any CDS, 3'UTR and 5'UTR features found in any mRNA child features.

Parameters:

Name Type Description Default
reason str

Text describing the reason for marking as a pseudogene (default "unknown").

'unknown'
Source code in src/geffa/geffa.py
def mark_as_pseudo(self, reason: str = 'unknown') -> None:
    """This marks the gene feature as a pseudo gene.

    It will delete any CDS, 3'UTR and 5'UTR features found in any mRNA
    child features.

    Args:
        reason: Text describing the reason for marking as a pseudogene
            (default "unknown").
    """
    self.attributes['pseudogene'] = reason
    for child in self.children:
        if child.type == 'mRNA':
            for grandchild in list(child.children):
                if grandchild.type in [
                    'CDS', 'three_prime_UTR', 'five_prime_UTR'
                ]:
                    grandchild.delete()

validate()

Validate the gene node.

Source code in src/geffa/geffa.py
def validate(self) -> None:
    """Validate the gene node."""
    if self.type != 'gene':
        raise ValueError(f'GeneNode called with wrong type "{self.type}"')
    if self.phase != '.':
        raise ValueError('Phase needs to be "." for genes.')
    if 'Parent' in self.attributes:
        raise ValueError(
            '"Parent" attribute does not make sense for gene.')

GenericNode

Bases: Node

A catch-all node type used if ignore_unknown_feature_types=True.

Source code in src/geffa/geffa.py
class GenericNode(Node):
    """A catch-all node type used if `ignore_unknown_feature_types=True`."""
    type: str = '__generic__'
    # The node can be top level.
    toplevel: bool = True
    # This is only used if `ignore_unknown_feature_types` is enabled

    def __init__(
        self,
        line_nr,
        sequence_region,
        source,
        entry_type,
        start,
        end,
        score,
        strand,
        phase,
        attributes,
        *args, **kwargs
    ):
        self.type = entry_type
        super().__init__(
            line_nr,
            sequence_region,
            source,
            entry_type,
            start,
            end,
            score,
            strand,
            phase,
            attributes,
            *args, **kwargs
        )

    def validate(self) -> None:
        """Stub function, cannot actually validate a generic node since we
        don't know anything about it."""
        pass

validate()

Stub function, cannot actually validate a generic node since we don't know anything about it.

Source code in src/geffa/geffa.py
def validate(self) -> None:
    """Stub function, cannot actually validate a generic node since we
    don't know anything about it."""
    pass

GffFile

Describes a GFF file.

Attributes:

Name Type Description
sequence_regions dict[str, SequenceRegion]

A dict containing the sequence regions stored within the GFF file.

Source code in src/geffa/geffa.py
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
class GffFile:
    """Describes a GFF file.

    Attributes:
        sequence_regions: A `dict` containing the sequence regions stored
            within the GFF file.
    """

    sequence_regions: dict[str, SequenceRegion]

    def __init__(
        self,
        gff_file: str | None = None,
        fasta_file: str | None = None,
        postpone_validation: bool = True,
        ignore_unknown_feature_types: bool = False
    ) -> None:
        """Initialize a GFF file object.

        If `gff_file` is `None`, creates an empty GFF file, that can
        optionally be populated with sequences from `fasta_file`.

        Args:
            gff_file: File name of the GFF file (default `None`).
            fasta_file: File name of a FASTA file containing the contig
                sequences (default `None`)
            postpone_validation: Whether to postpone running GFF validation,
                will run validation directly after loading if `False`
                (default `True`).
            ignore_unknown_feature_types: Raise an exception if an unknown
                feature type is encountered if `False` (default `False`).
        """
        if fasta_file is not None:
            self._contigs = _read_fasta_file(fasta_file)
        else:
            self._contigs = {}

        if gff_file is not None:
            self._read_gff_file(gff_file, ignore_unknown_feature_types)
        else:
            self.sequence_regions = self._generate_seqregs()

        if not postpone_validation:
            self.validate()

    def _generate_seqregs(self) -> dict[str, SequenceRegion]:
        # Generate empty sequence regions from the given contigs.
        return {
            name: SequenceRegion(name, 1, len(sequence)+1, sequence)
            for name, sequence in self._contigs.items()
        }

    def _read_gff_file(self, filename, ignore_unknown_feature_types) -> None:
        # Parse GFF file
        header_lines: list[str] = []
        GFF_lines: list[str] = []

        # Read GFF file line by line
        with open(filename, 'r') as f:
            f: TextIO
            # Read in header (until we encounter a line not starting with "##")
            for line in f:
                if not line.startswith('##'):
                    break
                header_lines.append(line)

            # Return to the start of the file.
            f.seek(0)
            read_fasta: bool = False
            for line_nr, line in enumerate(f):
                # If we encounter a line starting with "##FASTA", stop reading
                # GFF lines and treat the rest as a FASTA file.
                if line.startswith('##FASTA'):
                    read_fasta = True
                    break
                # Ignore comments
                if line.startswith('#'):
                    continue
                # Record line
                GFF_lines.append((line_nr, line))
            if read_fasta:
                if len(self._contigs) > 0:
                    # If we already read in an external FASTA, use that, but
                    # warn the user.
                    logger.warning(
                        'External FASTA file provided, but GFF contains FASTA '
                        'section. Using the external data.'
                    )
                else:
                    # If the file has a FASTA portion at the end, use that.
                    self._contigs.update(_parse_fasta(f.read()))

        # New sequence regions
        seqregs: dict[str, SequenceRegion] = self._generate_seqregs()
        # The header contains info on the sequence regions (among other
        # things, but we only use the sequence region info)
        # This is formatted this way: "##sequence-region <name> <start> <stop>"
        for line in header_lines:
            split: list[str] = re.split(r'\s+', line)
            if split[0] == '##sequence-region':
                name: str
                start: str
                end: str
                name, start, end = split[1:-1]
                if name in seqregs:
                    seqreg: SequenceRegion = seqregs[name]
                    if (
                        (int(start) != seqreg.start) or
                        (int(end) != seqreg.end)
                    ):
                        ValueError(
                            'FASTA and GFF disagree on start or end of '
                            f'sequence region {name} - start {start} vs '
                            f'{seqreg.start} and end {end} vs {seqreg.end}'
                        )
                else:
                    seqregs[split[1]] = SequenceRegion(
                        split[1], int(split[2]), int(split[3]), None)

        # Parse the body of the GFF
        for line_nr, line in GFF_lines:
            # Each line is formatted this way:
            # <sequence ID> <source> <type> <start> <end> <score> <strand> <phase> <attributes>. # noqa: E501
            # We split the line on whitespace, and make sure that we have nine
            # components
            splits: list[str] = re.split(r'\s+', line[:-1])
            if len(splits) < 9:
                raise ValueError(f'Malformatted line on line nr {line_nr}.')
            elif len(splits) > 9:
                # We need to treat the last entry in a special way, because
                # the attributes can contain whitespace.
                last_entry: str = ' '.join(splits[8:])
                splits = list(splits[:8]) + [last_entry]

            _: str
            seq_id: str
            entry_type: str
            seq_id, _, entry_type, start, end, _, _, _, _ = splits

            try:
                seqreg = seqregs[seq_id]
            except KeyError:
                raise ValueError(
                    f'Unknown sequence region ID on line nr {line_nr}.')

            # There are GFF files with non-spec feature types, such as
            # "protein_coding_gene". We treat them as a gene feature, but
            # remember the non-standard type for saving.
            # TODO: We should treat this as a validation / fix issue.
            if entry_type in EXTRA_FEATURE_TYPES:
                extra_feature_type = entry_type
                entry_type = EXTRA_FEATURE_TYPES[entry_type]
            else:
                extra_feature_type = None

            try:
                # Generate a new node for the GFF entry.
                # This raises a StopIteration exception if it finds a node
                # type it doesn't know.
                node_type = next(
                    subclass for subclass in Node.__subclasses__()
                    if subclass.type == entry_type
                )
                node_type(
                    line_nr+1,
                    seqreg,
                    splits[1],
                    entry_type,
                    *splits[3:],
                    extra_feature_type=extra_feature_type
                )
            except StopIteration:
                # Produce a GenericNode if instructed to ignore unknown
                # feature types, otherwise raise an Exception.
                if ignore_unknown_feature_types:
                    GenericNode(line_nr+1, seqreg, *splits[1:])
                else:
                    raise ValueError(
                        f'Unknown feature type {entry_type} on line nr'
                        f'{line_nr}.'
                    )
            except Exception:
                raise Exception(f'Exception raised on line nr {line_nr}.')

        self.sequence_regions = seqregs

    def validate(self) -> None:
        """Validate the GFF file, its sequence regions and the contained
        feature nodes."""
        for seqreg in self.sequence_regions.values():
            seqreg.validate()

    def fix_issues(self) -> None:
        """Recursively fix any issues found during validation that can be
        fixed automatically.

        This needs to be run after `GFFFile.validate`, otherwise no issues
        have been populated.
        """
        def fix_issues_recursively(node) -> None:
            # Issues are fixed depth-first.
            for child in node.children:
                fix_issues_recursively(child)
            while node.issues:
                issue: Issue = node.issues[0]
                del node.issues[0]
                logger.debug(
                    f'{issue.node.attributes["ID"]}: Fixing issue of type '
                    f'{type(issue).__name__}'
                )
                try:
                    issue.fix()
                except NotImplementedError:
                    logger.debug(
                        f'{issue.node.attributes["ID"]}: Unable to fix.')

        # Go through all sequence regions and run fixes.
        for seqreg in self.sequence_regions.values():
            for gene in seqreg.nodes_of_type(GeneNode):
                while True:
                    try:
                        fix_issues_recursively(gene)
                    except RevalidationNecessary:
                        # On a gene-level, revalidation after a fix might be
                        # necessary.
                        logger.debug(
                            f'{gene.attributes["ID"]}: Revalidating after fix')
                        gene._validate()
                        continue
                    break

    def gather_issues(self) -> dict[str, list[Issue]]:
        """Collect issues from all feature nodes. Returns a dict with the
        issue type as key and a list of occurrences as the value."""
        issues: defaultdict[str, list[Issue]] = defaultdict(list)
        for seqreg in self.sequence_regions.values():
            seqreg.validate()
            for node in seqreg.node_registry.values():
                for issue in node.issues:
                    issues[type(issue).__name__].append(issue)
        return dict(issues)

    def validation_report(self) -> str:
        """Return a report on issues found as a string."""
        issues: dict[str, list[Issue]] = self.gather_issues()

        report: str = 'Issue summary:\n'
        if len(issues) == 0:
            report += 'No issues found.\n'
        else:
            for issue_type in sorted(issues):
                report += f' * {issue_type}: {len(issues[issue_type])} found\n'

            for issue_type in sorted(issues):
                report += f'\n#### {issue_type} ####\n'
                for issue in issues[issue_type]:
                    report += str(issue) + '\n'

        return report

    def merge(self, gff: GffFile):
        """Merge another GFF file into this one."""
        srnames_this: set[str] = set(self.sequence_regions)
        srnames_other: set[str] = set(gff.sequence_regions)
        if srnames_this.intersection(srnames_other):
            raise ValueError(
                'Unable to merge because the given GffFile object contains '
                'sequence region names that overlap with existing sequence '
                'region names.'
            )
        self._contigs.update(gff._contigs)
        self.sequence_regions.update(gff.sequence_regions)

    def __getitem__(self, key: str) -> Node:
        """Return the feature node with the given ID."""
        for seqreg in self.sequence_regions.values():
            try:
                return seqreg.node_registry[key]
            except KeyError:
                pass
        raise KeyError(f'No feature with ID "{key}" found.')

    def _sequence_region_header(self, skip_empty_sequences=True) -> str:
        # Construct the header lines for all sequence regions
        header: str = ''
        for seqreg in self.sequence_regions.values():
            if skip_empty_sequences and (len(seqreg.node_registry) == 0):
                continue
            header += str(seqreg) + '\n'
        return header

    def __str__(self) -> str:
        """Format and return the GFF file as a string."""
        header: str = '''##gff-version    3
##feature-ontology  so.obo
##attribute-ontology    gff3_attributes.obo
'''
        header += self._sequence_region_header()
        body: str = ''
        for _, seqreg in sorted(
            self.sequence_regions.items(), key=lambda x: x[0]
        ):
            for feature in sorted(
                (
                    entry for entry in seqreg.node_registry.values()
                    if entry.toplevel and not entry.parents
                ),
                key=lambda x: int(x.start)
            ):
                body += '###\n' + str(feature) + '\n'
        return header + body

    def save(self, filename: str, include_sequences: bool = False):
        """Save the GFF file.

        Args:
            filename: Name of the output file.
            include_sequences: Whether to append the sequences as a FASTA
                portion (default `False)
        """
        with open(filename, 'w') as f:
            f.write(str(self))
            if include_sequences:
                f.write('##FASTA\n')
                for seqreg in self.sequence_regions.values():
                    if (
                        (len(seqreg.node_registry) == 0) or
                        (seqreg.sequence is None)
                    ):
                        continue
                    seq: str = str(seqreg.sequence)
                    f.write(f'>{seqreg.name}\n')
                    for i in range(0, len(seq)+1, 80):
                        f.write(seq[i:i+80] + '\n')

__getitem__(key)

Return the feature node with the given ID.

Source code in src/geffa/geffa.py
def __getitem__(self, key: str) -> Node:
    """Return the feature node with the given ID."""
    for seqreg in self.sequence_regions.values():
        try:
            return seqreg.node_registry[key]
        except KeyError:
            pass
    raise KeyError(f'No feature with ID "{key}" found.')

__init__(gff_file=None, fasta_file=None, postpone_validation=True, ignore_unknown_feature_types=False)

Initialize a GFF file object.

If gff_file is None, creates an empty GFF file, that can optionally be populated with sequences from fasta_file.

Parameters:

Name Type Description Default
gff_file str | None

File name of the GFF file (default None).

None
fasta_file str | None

File name of a FASTA file containing the contig sequences (default None)

None
postpone_validation bool

Whether to postpone running GFF validation, will run validation directly after loading if False (default True).

True
ignore_unknown_feature_types bool

Raise an exception if an unknown feature type is encountered if False (default False).

False
Source code in src/geffa/geffa.py
def __init__(
    self,
    gff_file: str | None = None,
    fasta_file: str | None = None,
    postpone_validation: bool = True,
    ignore_unknown_feature_types: bool = False
) -> None:
    """Initialize a GFF file object.

    If `gff_file` is `None`, creates an empty GFF file, that can
    optionally be populated with sequences from `fasta_file`.

    Args:
        gff_file: File name of the GFF file (default `None`).
        fasta_file: File name of a FASTA file containing the contig
            sequences (default `None`)
        postpone_validation: Whether to postpone running GFF validation,
            will run validation directly after loading if `False`
            (default `True`).
        ignore_unknown_feature_types: Raise an exception if an unknown
            feature type is encountered if `False` (default `False`).
    """
    if fasta_file is not None:
        self._contigs = _read_fasta_file(fasta_file)
    else:
        self._contigs = {}

    if gff_file is not None:
        self._read_gff_file(gff_file, ignore_unknown_feature_types)
    else:
        self.sequence_regions = self._generate_seqregs()

    if not postpone_validation:
        self.validate()

__str__()

Format and return the GFF file as a string.

Source code in src/geffa/geffa.py
    def __str__(self) -> str:
        """Format and return the GFF file as a string."""
        header: str = '''##gff-version    3
##feature-ontology  so.obo
##attribute-ontology    gff3_attributes.obo
'''
        header += self._sequence_region_header()
        body: str = ''
        for _, seqreg in sorted(
            self.sequence_regions.items(), key=lambda x: x[0]
        ):
            for feature in sorted(
                (
                    entry for entry in seqreg.node_registry.values()
                    if entry.toplevel and not entry.parents
                ),
                key=lambda x: int(x.start)
            ):
                body += '###\n' + str(feature) + '\n'
        return header + body

fix_issues()

Recursively fix any issues found during validation that can be fixed automatically.

This needs to be run after GFFFile.validate, otherwise no issues have been populated.

Source code in src/geffa/geffa.py
def fix_issues(self) -> None:
    """Recursively fix any issues found during validation that can be
    fixed automatically.

    This needs to be run after `GFFFile.validate`, otherwise no issues
    have been populated.
    """
    def fix_issues_recursively(node) -> None:
        # Issues are fixed depth-first.
        for child in node.children:
            fix_issues_recursively(child)
        while node.issues:
            issue: Issue = node.issues[0]
            del node.issues[0]
            logger.debug(
                f'{issue.node.attributes["ID"]}: Fixing issue of type '
                f'{type(issue).__name__}'
            )
            try:
                issue.fix()
            except NotImplementedError:
                logger.debug(
                    f'{issue.node.attributes["ID"]}: Unable to fix.')

    # Go through all sequence regions and run fixes.
    for seqreg in self.sequence_regions.values():
        for gene in seqreg.nodes_of_type(GeneNode):
            while True:
                try:
                    fix_issues_recursively(gene)
                except RevalidationNecessary:
                    # On a gene-level, revalidation after a fix might be
                    # necessary.
                    logger.debug(
                        f'{gene.attributes["ID"]}: Revalidating after fix')
                    gene._validate()
                    continue
                break

gather_issues()

Collect issues from all feature nodes. Returns a dict with the issue type as key and a list of occurrences as the value.

Source code in src/geffa/geffa.py
def gather_issues(self) -> dict[str, list[Issue]]:
    """Collect issues from all feature nodes. Returns a dict with the
    issue type as key and a list of occurrences as the value."""
    issues: defaultdict[str, list[Issue]] = defaultdict(list)
    for seqreg in self.sequence_regions.values():
        seqreg.validate()
        for node in seqreg.node_registry.values():
            for issue in node.issues:
                issues[type(issue).__name__].append(issue)
    return dict(issues)

merge(gff)

Merge another GFF file into this one.

Source code in src/geffa/geffa.py
def merge(self, gff: GffFile):
    """Merge another GFF file into this one."""
    srnames_this: set[str] = set(self.sequence_regions)
    srnames_other: set[str] = set(gff.sequence_regions)
    if srnames_this.intersection(srnames_other):
        raise ValueError(
            'Unable to merge because the given GffFile object contains '
            'sequence region names that overlap with existing sequence '
            'region names.'
        )
    self._contigs.update(gff._contigs)
    self.sequence_regions.update(gff.sequence_regions)

save(filename, include_sequences=False)

Save the GFF file.

Parameters:

Name Type Description Default
filename str

Name of the output file.

required
include_sequences bool

Whether to append the sequences as a FASTA portion (default `False)

False
Source code in src/geffa/geffa.py
def save(self, filename: str, include_sequences: bool = False):
    """Save the GFF file.

    Args:
        filename: Name of the output file.
        include_sequences: Whether to append the sequences as a FASTA
            portion (default `False)
    """
    with open(filename, 'w') as f:
        f.write(str(self))
        if include_sequences:
            f.write('##FASTA\n')
            for seqreg in self.sequence_regions.values():
                if (
                    (len(seqreg.node_registry) == 0) or
                    (seqreg.sequence is None)
                ):
                    continue
                seq: str = str(seqreg.sequence)
                f.write(f'>{seqreg.name}\n')
                for i in range(0, len(seq)+1, 80):
                    f.write(seq[i:i+80] + '\n')

validate()

Validate the GFF file, its sequence regions and the contained feature nodes.

Source code in src/geffa/geffa.py
def validate(self) -> None:
    """Validate the GFF file, its sequence regions and the contained
    feature nodes."""
    for seqreg in self.sequence_regions.values():
        seqreg.validate()

validation_report()

Return a report on issues found as a string.

Source code in src/geffa/geffa.py
def validation_report(self) -> str:
    """Return a report on issues found as a string."""
    issues: dict[str, list[Issue]] = self.gather_issues()

    report: str = 'Issue summary:\n'
    if len(issues) == 0:
        report += 'No issues found.\n'
    else:
        for issue_type in sorted(issues):
            report += f' * {issue_type}: {len(issues[issue_type])} found\n'

        for issue_type in sorted(issues):
            report += f'\n#### {issue_type} ####\n'
            for issue in issues[issue_type]:
                report += str(issue) + '\n'

    return report

Issue

Base type for an issue found within the GFF file. Cannot be used as is, needs to be subclassed.

Source code in src/geffa/geffa.py
class Issue:
    """Base type for an issue found within the GFF file. Cannot be used as is,
    needs to be subclassed."""
    def __init__(self, node) -> None:
        self.node = node

    def fix(self) -> None:
        """Implement to fix the raised issue (if possible)."""
        raise NotImplementedError

    def _debug_log_msg(self, msg) -> None:
        logger.debug(f'{self.node.attributes["ID"]}: {msg}')

    def __message__(self) -> str:
        """Implement to display an informative issue message."""
        raise NotImplementedError

    def __str__(self) -> str:
        return (
            f'Issue with {self.node.type} {self.node.attributes["ID"]} '
            '(line {self.node.line_nr}): {self.__message__()}'
        )

__message__()

Implement to display an informative issue message.

Source code in src/geffa/geffa.py
def __message__(self) -> str:
    """Implement to display an informative issue message."""
    raise NotImplementedError

fix()

Implement to fix the raised issue (if possible).

Source code in src/geffa/geffa.py
def fix(self) -> None:
    """Implement to fix the raised issue (if possible)."""
    raise NotImplementedError

MRNANode

Bases: Node

Node type describing an mRNA feature.

Source code in src/geffa/geffa.py
class MRNANode(Node):
    """Node type describing an mRNA feature."""
    type: str = 'mRNA'
    toplevel: bool = False

    def validate(self) -> None:
        """Validate the mRNA node.

        This does basic consistence checks on the metadata, checks if the CDS
        starts with a start codon, ends with a stop codon and has no internal
        stop codons. It also checks if the exons cover the CDSs completely.
        """
        if self.phase != '.':
            raise ValueError('Phase needs to be "." for mRNA.')

        if len(self.parents) != 1:
            raise ValueError('mRNA needs exactly one parent')

        if self.parents[0].type != 'gene':
            raise ValueError('mRNA parent needs to be gene')

        # Validate CDS child feature phases
        CDSs: list[Node] = self.CDS_children()
        if len(CDSs) > 0:
            phases = self.calculate_CDS_phases()
            for p, CDS in zip(phases, CDSs):
                if p != CDS.phase:
                    self.issues.append(CDSPhaseIncorrectIssue(CDS, p))

        # Validate start / stop codon presence and positon.
        # Only done when the coding sequence is available.
        seq: Seq | None = self.coding_sequence()
        if seq is not None:
            first_codon: Seq = seq[:3]
            last_codon: Seq = seq[3*(len(seq)//3)-3:3*(len(seq)//3)]
            if first_codon not in START_CODONS:
                self.issues.append(CodingSequenceStartIsNotStartCodon(self))
            if last_codon not in STOP_CODONS:
                self.issues.append(CodingSequenceEndIsNotStopCodon(self))

            # Check for internal stop codons
            codons: list[str] = [
                str(seq[i:i+3]) for i in range(0, len(seq)-3, 3)]
            for stop_codon in STOP_CODONS:
                if stop_codon in codons:
                    self.issues.append(
                        CodingSequenceContainsInternalStopCodons(self))
                    break

        # Checks if the exon child features cover the contained CDSs
        # completely.
        exons: list[Node] = [
            child for child in self.children if child.type == 'exon']
        for exon in exons:
            for CDS in CDSs:
                if (
                    ((CDS.start < exon.start) and (CDS.end > exon.start)) or
                    ((CDS.start < exon.end) and (CDS.end > exon.end))
                ):
                    self.issues.append(
                        ExonDoesNotContainCDSCompletely(exon, CDS))

    def calculate_CDS_phases(self) -> list[int]:
        """Calculates the phase values for all CDS child features."""
        CDSs: list[Node] = self.CDS_children()
        if len(CDSs) == 0:
            return []
        phases: list[int] = [0]
        for i, CDS in enumerate(CDSs[1:]):
            # Calculate phase needed. The phase of the first CDS feature is
            # always p_1=0. (i.e. the first codon of the first CDS always
            # starts at the bp indicated by the start coordinate) The
            # subsequent phases p_i are calculated by taking the length of the
            # preceding CDS L_{i-1}=e_{i-1}-s_{i-1}+1 and taking its modulo 3,
            # L_{i-1} % 3, to calculate the number of "overhanging" bps, then
            # subtracting this from 3 to get the number of missing bps having
            # to be filled in at the current CDS, plus the preceding phase
            # p_{i-1} and everything modulo 3 again. In summary:
            #  p_{i} = [ p_{i-1} + 3 - ( e_{i-1} - s_{i-1} + 1 ) % 3 ] % 3
            phases.append(
                (phases[-1] + 3-((CDSs[i].end - CDSs[i].start + 1) % 3)) % 3)
        return phases

    def CDS_children(self) -> list[CDSNode]:
        """Returns a sorted list of CDS child features."""
        CDSs: list[Node] = sorted(
            [
                child for child in self.children
                if child.type == 'CDS'
            ],
            key=lambda c: c.start
        )
        if self.strand == '-':
            CDSs.reverse()
        return CDSs

    def coding_sequence(self, extend_start_bps=0, extend_end_bps=0):
        """Returns the nucleotide coding sequence covered by the contained
        CDSs."""
        CDSs: list[Node] = self.CDS_children()
        if (len(CDSs) == 0) or (self.sequence_region.sequence is None):
            return None
        return Seq(''.join([str(CDS.sequence) for CDS in CDSs]))

    def protein_sequence(self) -> str | None:
        """Returns the amino acid sequence corresponding to the mRNA's coding
        sequence."""
        coding_sequence: Seq | None = self.coding_sequence()
        if coding_sequence is None:
            return None
        protein_sequence: str = ""
        for codon in textwrap.wrap(coding_sequence, 3):
            try:
                protein_sequence += CODON_TRANSLATION_TABLE[codon]
            except KeyError:
                protein_sequence += 'X'
        return protein_sequence

    def extend_coordinates(
        self,
        new_start: int | None = None,
        new_end: int | None = None
    ):
        """Extend the start and stop coordinates of the feature to the given
        coordinates.

        Crucially, this only allows extending, a feature cannot be shrunk this
        way (i.e. the new start cannot be to the right of the original).
        It also extends the parent's coordinates.

        Args:
            new_start: New start coordinate (default `None`).
            new_end: New end coordintate (default `None`).
        """
        super().extend_coordinates(new_start, new_end)
        self.parents[0].extend_coordinates(new_start, new_end)

CDS_children()

Returns a sorted list of CDS child features.

Source code in src/geffa/geffa.py
def CDS_children(self) -> list[CDSNode]:
    """Returns a sorted list of CDS child features."""
    CDSs: list[Node] = sorted(
        [
            child for child in self.children
            if child.type == 'CDS'
        ],
        key=lambda c: c.start
    )
    if self.strand == '-':
        CDSs.reverse()
    return CDSs

calculate_CDS_phases()

Calculates the phase values for all CDS child features.

Source code in src/geffa/geffa.py
def calculate_CDS_phases(self) -> list[int]:
    """Calculates the phase values for all CDS child features."""
    CDSs: list[Node] = self.CDS_children()
    if len(CDSs) == 0:
        return []
    phases: list[int] = [0]
    for i, CDS in enumerate(CDSs[1:]):
        # Calculate phase needed. The phase of the first CDS feature is
        # always p_1=0. (i.e. the first codon of the first CDS always
        # starts at the bp indicated by the start coordinate) The
        # subsequent phases p_i are calculated by taking the length of the
        # preceding CDS L_{i-1}=e_{i-1}-s_{i-1}+1 and taking its modulo 3,
        # L_{i-1} % 3, to calculate the number of "overhanging" bps, then
        # subtracting this from 3 to get the number of missing bps having
        # to be filled in at the current CDS, plus the preceding phase
        # p_{i-1} and everything modulo 3 again. In summary:
        #  p_{i} = [ p_{i-1} + 3 - ( e_{i-1} - s_{i-1} + 1 ) % 3 ] % 3
        phases.append(
            (phases[-1] + 3-((CDSs[i].end - CDSs[i].start + 1) % 3)) % 3)
    return phases

coding_sequence(extend_start_bps=0, extend_end_bps=0)

Returns the nucleotide coding sequence covered by the contained CDSs.

Source code in src/geffa/geffa.py
def coding_sequence(self, extend_start_bps=0, extend_end_bps=0):
    """Returns the nucleotide coding sequence covered by the contained
    CDSs."""
    CDSs: list[Node] = self.CDS_children()
    if (len(CDSs) == 0) or (self.sequence_region.sequence is None):
        return None
    return Seq(''.join([str(CDS.sequence) for CDS in CDSs]))

extend_coordinates(new_start=None, new_end=None)

Extend the start and stop coordinates of the feature to the given coordinates.

Crucially, this only allows extending, a feature cannot be shrunk this way (i.e. the new start cannot be to the right of the original). It also extends the parent's coordinates.

Parameters:

Name Type Description Default
new_start int | None

New start coordinate (default None).

None
new_end int | None

New end coordintate (default None).

None
Source code in src/geffa/geffa.py
def extend_coordinates(
    self,
    new_start: int | None = None,
    new_end: int | None = None
):
    """Extend the start and stop coordinates of the feature to the given
    coordinates.

    Crucially, this only allows extending, a feature cannot be shrunk this
    way (i.e. the new start cannot be to the right of the original).
    It also extends the parent's coordinates.

    Args:
        new_start: New start coordinate (default `None`).
        new_end: New end coordintate (default `None`).
    """
    super().extend_coordinates(new_start, new_end)
    self.parents[0].extend_coordinates(new_start, new_end)

protein_sequence()

Returns the amino acid sequence corresponding to the mRNA's coding sequence.

Source code in src/geffa/geffa.py
def protein_sequence(self) -> str | None:
    """Returns the amino acid sequence corresponding to the mRNA's coding
    sequence."""
    coding_sequence: Seq | None = self.coding_sequence()
    if coding_sequence is None:
        return None
    protein_sequence: str = ""
    for codon in textwrap.wrap(coding_sequence, 3):
        try:
            protein_sequence += CODON_TRANSLATION_TABLE[codon]
        except KeyError:
            protein_sequence += 'X'
    return protein_sequence

validate()

Validate the mRNA node.

This does basic consistence checks on the metadata, checks if the CDS starts with a start codon, ends with a stop codon and has no internal stop codons. It also checks if the exons cover the CDSs completely.

Source code in src/geffa/geffa.py
def validate(self) -> None:
    """Validate the mRNA node.

    This does basic consistence checks on the metadata, checks if the CDS
    starts with a start codon, ends with a stop codon and has no internal
    stop codons. It also checks if the exons cover the CDSs completely.
    """
    if self.phase != '.':
        raise ValueError('Phase needs to be "." for mRNA.')

    if len(self.parents) != 1:
        raise ValueError('mRNA needs exactly one parent')

    if self.parents[0].type != 'gene':
        raise ValueError('mRNA parent needs to be gene')

    # Validate CDS child feature phases
    CDSs: list[Node] = self.CDS_children()
    if len(CDSs) > 0:
        phases = self.calculate_CDS_phases()
        for p, CDS in zip(phases, CDSs):
            if p != CDS.phase:
                self.issues.append(CDSPhaseIncorrectIssue(CDS, p))

    # Validate start / stop codon presence and positon.
    # Only done when the coding sequence is available.
    seq: Seq | None = self.coding_sequence()
    if seq is not None:
        first_codon: Seq = seq[:3]
        last_codon: Seq = seq[3*(len(seq)//3)-3:3*(len(seq)//3)]
        if first_codon not in START_CODONS:
            self.issues.append(CodingSequenceStartIsNotStartCodon(self))
        if last_codon not in STOP_CODONS:
            self.issues.append(CodingSequenceEndIsNotStopCodon(self))

        # Check for internal stop codons
        codons: list[str] = [
            str(seq[i:i+3]) for i in range(0, len(seq)-3, 3)]
        for stop_codon in STOP_CODONS:
            if stop_codon in codons:
                self.issues.append(
                    CodingSequenceContainsInternalStopCodons(self))
                break

    # Checks if the exon child features cover the contained CDSs
    # completely.
    exons: list[Node] = [
        child for child in self.children if child.type == 'exon']
    for exon in exons:
        for CDS in CDSs:
            if (
                ((CDS.start < exon.start) and (CDS.end > exon.start)) or
                ((CDS.start < exon.end) and (CDS.end > exon.end))
            ):
                self.issues.append(
                    ExonDoesNotContainCDSCompletely(exon, CDS))

NcRNANode

Bases: Node

Node type describing a non-coding RNA feature.

Source code in src/geffa/geffa.py
class NcRNANode(Node):
    """Node type describing a non-coding RNA feature."""
    type: str = 'ncRNA'
    toplevel: bool = False

    def validate(self) -> None:
        """Validate the ncRNA feature."""
        if self.phase != '.':
            raise ValueError('Phase needs to be "." for ncRNA.')

        for p in self.parents:
            if p.type != 'gene':
                raise ValueError('ncRNA parent needs to be gene')

validate()

Validate the ncRNA feature.

Source code in src/geffa/geffa.py
def validate(self) -> None:
    """Validate the ncRNA feature."""
    if self.phase != '.':
        raise ValueError('Phase needs to be "." for ncRNA.')

    for p in self.parents:
        if p.type != 'gene':
            raise ValueError('ncRNA parent needs to be gene')

Node

Base class of a GFF Node

Source code in src/geffa/geffa.py
class Node:
    """Base class of a GFF Node"""
    type: str = "__node__"

    def __init__(
            self,
            line_nr: int,
            sequence_region: SequenceRegion,
            source: str,
            entry_type: str,
            start: int | str,
            end: int | str,
            score: int | str,
            strand: str,
            phase: int | str,
            attributes: str,
            extra_feature_type: str | None = None,
            *args, **kwargs) -> None:
        """Initialise a GFF Node

        Args:
            line_nr: The line number the `Node` is defined at in the GFF file
            sequence_region: The `SequenceRegion` the `Node` belongs to.
            source: The source specifier from the line in the GFF file.
            entry_type: The type of `Node`.
            start: The start coordinate of the `Node`.
            end: The end coordinate of the `Node`.
            score: The score value of the `Node`.
            phase: The phase value of the `Node`.
        """
        start: int = int(start)
        end: int = int(end)
        if start > end:
            raise ValueError(
                f'Start needs to come before end at line nr {line_nr}.')
        if not issubclass(type(sequence_region), SequenceRegion):
            raise ValueError(
                'sequence_region needs to be of type SequenceRegion.')
        if strand not in ['+', '-', '.']:
            raise ValueError(f'Invalid strand value at line nr {line_nr}')
        if phase not in ['.', '0', '1', '2']:
            raise ValueError(f'Invalid CDS phase value at line nr {line_nr}.')
        if not issubclass(type(attributes), str):
            raise ValueError('attributes needs to be a string.')
        if entry_type != self.type:
            raise ValueError('Called wrong node type!')

        self.issues: list[Issue] = []
        self.line_nr = line_nr
        self.sequence_region = sequence_region
        self.source = source
        self.start: int = int(start)
        self.end: int = int(end)
        self.score = score
        self.strand = strand
        self.phase: str | int = phase if phase == '.' else int(phase)
        try:
            self.attributes: dict[str, str] = dict(
                item.split('=') for item in attributes.split(';'))
        except ValueError:
            raise ValueError(
                f'Invalid attributes entry on line nr {self.line_nr}.')

        # Allow for GFF-spec-subverting special feature types, such as
        # "protein_coding_gene"
        self.extra_feature_type: str | None = extra_feature_type

        self.children: list[Node] = []
        self.parents: list[Node] = []
        # Assign parents
        if 'Parent' in self.attributes:
            for pid in self.attributes['Parent'].split(','):
                # This assumes that any parents are already read in - should
                # be though because they need to be listed before any children.
                try:
                    self.parents.append(
                        self.sequence_region.node_registry[pid])
                except KeyError:
                    raise ValueError(
                        f'Invalid parent ID {pid} at line nr {self.line_nr}')
            for p in self.parents:
                p.children.append(self)
                if self.strand != p.strand:
                    raise ValueError(
                        "Strand is different between parent and child at line "
                        f"nr {self.line_nr}.")
        elif not self.toplevel:
            raise ValueError(
                'Node has no parent but cannot be top level either on line '
                f'{self.line_nr}')

        self.sequence_region.add_node(self)

    def add_parent(self, parent: Node):
        if self.strand != parent.strand:
            raise ValueError("Strand is different between parent and child.")
        self.parents.append(parent)
        parent.children.append(self)
        self.attributes['Parent'] = ','.join(
            [p.attributes['ID'] for p in self.parents])

    @property
    def sequence(self) -> Seq | None:
        """The nucleotide sequence of the feature (if the sequence region's
        sequence is available, otherwise `None`)."""
        region_seq: Seq | None = self.sequence_region.sequence
        if region_seq is None:
            return None

        seq: Seq = region_seq[self.start-1:self.end]
        if self.strand == '-':
            seq = seq.reverse_complement()

        return seq

    def __len__(self) -> int:
        """Length (in nucleotides) of the feature."""
        return self.end - self.start + 1

    def _validate(self) -> None:
        # Run validation on this node and its children.
        self.issues = []
        self._validate_start_end()
        self.validate()
        for child in self.children:
            child._validate()

    def _validate_start_end(self) -> None:
        # Validate that the start and end of this feature is within the bounds
        # of its parents and the sequence region
        for p in self.parents:
            if self.start < p.start:
                self.issues.append(FeatureStartLeftOfParentIssue(self, p))
            if self.end > p.end:
                self.issues.append(FeatureEndRightOfParentIssue(self, p))
        if self.start < self.sequence_region.start:
            self.issues.append(FeatureStartLeftOfSequenceRegionStart(self))
        if self.end > self.sequence_region.end:
            self.issues.append(FeatureEndRightOfSequenceRegionEnd(self))

    def delete(self) -> None:
        """Delete the feature node and any of its children."""
        for child in list(self.children):
            child.delete()
        for p in self.parents:
            p.children.remove(self)
        del self.sequence_region.node_registry[self.attributes['ID']]

    def __str__(self) -> str:
        attr_str: str = ';'.join(
            [f'{key}={value}' for key, value in self.attributes.items()])
        if self.extra_feature_type is None:
            feature_type = self.type
        else:
            feature_type = self.extra_feature_type
        entries: list[str] = [
            f'{self.sequence_region.name}\t{self.source}\t{feature_type}\t'
            f'{self.start}\t{self.end}\t{self.score}\t{self.strand}\t'
            f'{self.phase}\t{attr_str}']
        for child in self.children:
            entries.append(str(child))
        return '\n'.join(entries)

    def __repr__(self) -> str:
        output_type = (
            self.type if self.extra_feature_type is None
            else self.extra_feature_type)
        return f'{output_type} {self.strand}[{self.start}, {self.end}]'

    def apply_recursively(self, func: Callable[[Node], Any]):
        """Apply the given function recursively to the node and its children.

        This is done depth-first, meaning any children are iterated over first.

        Args:
            func: Function to apply (needs to take the `Node` as a single
                argument, return is ignored).
        """
        for child in self.children:
            child.apply_recursively(func)
        return func(self)

    def extend_coordinates(
        self,
        new_start: int | None = None,
        new_end: int | None = None
    ):
        """Extend the start and stop coordinates of the feature to the given
        coordinates.

        Crucially, this only allows extending, a feature cannot be shrunk this
        way (i.e. the new start cannot be to the right of the original).

        Args:
            new_start: New start coordinate (default `None`).
            new_end: New end coordintate (default `None`).
        """
        if new_start is not None:
            self.start = min(self.start, new_start)
        if new_end is not None:
            self.end = max(self.end, new_end)

    def to_dict(self) -> dict[str, Any]:
        """Returns a `dict` containing the feature parameters."""
        return {
            'type': self.type,
            'start': self.start,
            'end': self.end,
            'phase': self.phase,
            'strand': self.strand,
            'score': self.score,
            'attributes': self.attributes,
            'sequence_region': self.sequence_region.name,
        }

    def children_of_type(self, node_type: type[NodeType]) -> list[NodeType]:
        """Return a list of all direct children of the given type.

        Args:
            node_type: Type of the nodes to be returned, e.g. `MRNANode`, or
                `None` in which case all nodes in the sequence region will be
                returned. Default `None`.

        Returns:
            A list containing all direct child nodes of the given type.
        """
        children = cast(list[NodeType], self.children)
        return [
            node for node in children
            if node.type == node_type.type
        ]

    def has_child_of_type(self, type: str) -> bool:
        """Check whether this node as a child of the given node.

        Args:
            type: String describing the type of node to check for
                (i.e. "mRNA").

        Returns:
            Whether a child of the given type exists.
        """
        for child in self.children:
            if child.type == type:
                return True
        return False

sequence property

The nucleotide sequence of the feature (if the sequence region's sequence is available, otherwise None).

__init__(line_nr, sequence_region, source, entry_type, start, end, score, strand, phase, attributes, extra_feature_type=None, *args, **kwargs)

Initialise a GFF Node

Parameters:

Name Type Description Default
line_nr int

The line number the Node is defined at in the GFF file

required
sequence_region SequenceRegion

The SequenceRegion the Node belongs to.

required
source str

The source specifier from the line in the GFF file.

required
entry_type str

The type of Node.

required
start int | str

The start coordinate of the Node.

required
end int | str

The end coordinate of the Node.

required
score int | str

The score value of the Node.

required
phase int | str

The phase value of the Node.

required
Source code in src/geffa/geffa.py
def __init__(
        self,
        line_nr: int,
        sequence_region: SequenceRegion,
        source: str,
        entry_type: str,
        start: int | str,
        end: int | str,
        score: int | str,
        strand: str,
        phase: int | str,
        attributes: str,
        extra_feature_type: str | None = None,
        *args, **kwargs) -> None:
    """Initialise a GFF Node

    Args:
        line_nr: The line number the `Node` is defined at in the GFF file
        sequence_region: The `SequenceRegion` the `Node` belongs to.
        source: The source specifier from the line in the GFF file.
        entry_type: The type of `Node`.
        start: The start coordinate of the `Node`.
        end: The end coordinate of the `Node`.
        score: The score value of the `Node`.
        phase: The phase value of the `Node`.
    """
    start: int = int(start)
    end: int = int(end)
    if start > end:
        raise ValueError(
            f'Start needs to come before end at line nr {line_nr}.')
    if not issubclass(type(sequence_region), SequenceRegion):
        raise ValueError(
            'sequence_region needs to be of type SequenceRegion.')
    if strand not in ['+', '-', '.']:
        raise ValueError(f'Invalid strand value at line nr {line_nr}')
    if phase not in ['.', '0', '1', '2']:
        raise ValueError(f'Invalid CDS phase value at line nr {line_nr}.')
    if not issubclass(type(attributes), str):
        raise ValueError('attributes needs to be a string.')
    if entry_type != self.type:
        raise ValueError('Called wrong node type!')

    self.issues: list[Issue] = []
    self.line_nr = line_nr
    self.sequence_region = sequence_region
    self.source = source
    self.start: int = int(start)
    self.end: int = int(end)
    self.score = score
    self.strand = strand
    self.phase: str | int = phase if phase == '.' else int(phase)
    try:
        self.attributes: dict[str, str] = dict(
            item.split('=') for item in attributes.split(';'))
    except ValueError:
        raise ValueError(
            f'Invalid attributes entry on line nr {self.line_nr}.')

    # Allow for GFF-spec-subverting special feature types, such as
    # "protein_coding_gene"
    self.extra_feature_type: str | None = extra_feature_type

    self.children: list[Node] = []
    self.parents: list[Node] = []
    # Assign parents
    if 'Parent' in self.attributes:
        for pid in self.attributes['Parent'].split(','):
            # This assumes that any parents are already read in - should
            # be though because they need to be listed before any children.
            try:
                self.parents.append(
                    self.sequence_region.node_registry[pid])
            except KeyError:
                raise ValueError(
                    f'Invalid parent ID {pid} at line nr {self.line_nr}')
        for p in self.parents:
            p.children.append(self)
            if self.strand != p.strand:
                raise ValueError(
                    "Strand is different between parent and child at line "
                    f"nr {self.line_nr}.")
    elif not self.toplevel:
        raise ValueError(
            'Node has no parent but cannot be top level either on line '
            f'{self.line_nr}')

    self.sequence_region.add_node(self)

__len__()

Length (in nucleotides) of the feature.

Source code in src/geffa/geffa.py
def __len__(self) -> int:
    """Length (in nucleotides) of the feature."""
    return self.end - self.start + 1

apply_recursively(func)

Apply the given function recursively to the node and its children.

This is done depth-first, meaning any children are iterated over first.

Parameters:

Name Type Description Default
func Callable[[Node], Any]

Function to apply (needs to take the Node as a single argument, return is ignored).

required
Source code in src/geffa/geffa.py
def apply_recursively(self, func: Callable[[Node], Any]):
    """Apply the given function recursively to the node and its children.

    This is done depth-first, meaning any children are iterated over first.

    Args:
        func: Function to apply (needs to take the `Node` as a single
            argument, return is ignored).
    """
    for child in self.children:
        child.apply_recursively(func)
    return func(self)

children_of_type(node_type)

Return a list of all direct children of the given type.

Parameters:

Name Type Description Default
node_type type[NodeType]

Type of the nodes to be returned, e.g. MRNANode, or None in which case all nodes in the sequence region will be returned. Default None.

required

Returns:

Type Description
list[NodeType]

A list containing all direct child nodes of the given type.

Source code in src/geffa/geffa.py
def children_of_type(self, node_type: type[NodeType]) -> list[NodeType]:
    """Return a list of all direct children of the given type.

    Args:
        node_type: Type of the nodes to be returned, e.g. `MRNANode`, or
            `None` in which case all nodes in the sequence region will be
            returned. Default `None`.

    Returns:
        A list containing all direct child nodes of the given type.
    """
    children = cast(list[NodeType], self.children)
    return [
        node for node in children
        if node.type == node_type.type
    ]

delete()

Delete the feature node and any of its children.

Source code in src/geffa/geffa.py
def delete(self) -> None:
    """Delete the feature node and any of its children."""
    for child in list(self.children):
        child.delete()
    for p in self.parents:
        p.children.remove(self)
    del self.sequence_region.node_registry[self.attributes['ID']]

extend_coordinates(new_start=None, new_end=None)

Extend the start and stop coordinates of the feature to the given coordinates.

Crucially, this only allows extending, a feature cannot be shrunk this way (i.e. the new start cannot be to the right of the original).

Parameters:

Name Type Description Default
new_start int | None

New start coordinate (default None).

None
new_end int | None

New end coordintate (default None).

None
Source code in src/geffa/geffa.py
def extend_coordinates(
    self,
    new_start: int | None = None,
    new_end: int | None = None
):
    """Extend the start and stop coordinates of the feature to the given
    coordinates.

    Crucially, this only allows extending, a feature cannot be shrunk this
    way (i.e. the new start cannot be to the right of the original).

    Args:
        new_start: New start coordinate (default `None`).
        new_end: New end coordintate (default `None`).
    """
    if new_start is not None:
        self.start = min(self.start, new_start)
    if new_end is not None:
        self.end = max(self.end, new_end)

has_child_of_type(type)

Check whether this node as a child of the given node.

Parameters:

Name Type Description Default
type str

String describing the type of node to check for (i.e. "mRNA").

required

Returns:

Type Description
bool

Whether a child of the given type exists.

Source code in src/geffa/geffa.py
def has_child_of_type(self, type: str) -> bool:
    """Check whether this node as a child of the given node.

    Args:
        type: String describing the type of node to check for
            (i.e. "mRNA").

    Returns:
        Whether a child of the given type exists.
    """
    for child in self.children:
        if child.type == type:
            return True
    return False

to_dict()

Returns a dict containing the feature parameters.

Source code in src/geffa/geffa.py
def to_dict(self) -> dict[str, Any]:
    """Returns a `dict` containing the feature parameters."""
    return {
        'type': self.type,
        'start': self.start,
        'end': self.end,
        'phase': self.phase,
        'strand': self.strand,
        'score': self.score,
        'attributes': self.attributes,
        'sequence_region': self.sequence_region.name,
    }

PASNode

Bases: Node

Node type describing a poly-adenylation site.

Source code in src/geffa/geffa.py
class PASNode(Node):
    """Node type describing a poly-adenylation site."""
    type: str = 'PAS'
    toplevel: bool = True

    def validate(self) -> None:
        """Validate the PAS feature."""
        for p in self.parents:
            if p.type != 'gene':
                raise ValueError('PAS parent needs to be gene')
        if len(self) != 1:
            raise ValueError('PAS feature needs to be of length 1')

validate()

Validate the PAS feature.

Source code in src/geffa/geffa.py
def validate(self) -> None:
    """Validate the PAS feature."""
    for p in self.parents:
        if p.type != 'gene':
            raise ValueError('PAS parent needs to be gene')
    if len(self) != 1:
        raise ValueError('PAS feature needs to be of length 1')

RRNANode

Bases: Node

Node type describing a ribosomal RNA feature.

Source code in src/geffa/geffa.py
class RRNANode(Node):
    """Node type describing a ribosomal RNA feature."""
    type: str = 'rRNA'
    toplevel: bool = False

    def validate(self) -> None:
        """Validate the rRNA feature."""
        if self.phase != '.':
            raise ValueError('Phase needs to be "." for rRNA.')

        for p in self.parents:
            if p.type != 'gene':
                raise ValueError('rRNA parent needs to be gene')

validate()

Validate the rRNA feature.

Source code in src/geffa/geffa.py
def validate(self) -> None:
    """Validate the rRNA feature."""
    if self.phase != '.':
        raise ValueError('Phase needs to be "." for rRNA.')

    for p in self.parents:
        if p.type != 'gene':
            raise ValueError('rRNA parent needs to be gene')

RevalidationNecessary

Bases: Exception

Exception raised to trigger revalidation of the GFF

Source code in src/geffa/geffa.py
class RevalidationNecessary(Exception):
    """Exception raised to trigger revalidation of the GFF"""
    pass

SLASNode

Bases: Node

Node type describing a splice leader acceptor site.

Source code in src/geffa/geffa.py
class SLASNode(Node):
    """Node type describing a splice leader acceptor site."""
    type: str = 'SLAS'
    toplevel: bool = True

    def validate(self) -> None:
        """Validate a SLAS feature."""
        for p in self.parents:
            if p.type != 'gene':
                raise ValueError('SLAS parent needs to be gene')
        if len(self) != 2:
            raise ValueError('SLAS feature needs to be of length 2')

validate()

Validate a SLAS feature.

Source code in src/geffa/geffa.py
def validate(self) -> None:
    """Validate a SLAS feature."""
    for p in self.parents:
        if p.type != 'gene':
            raise ValueError('SLAS parent needs to be gene')
    if len(self) != 2:
        raise ValueError('SLAS feature needs to be of length 2')

STARTNode

Bases: Node

Node type describing a start codon site.

Source code in src/geffa/geffa.py
class STARTNode(Node):
    """Node type describing a start codon site."""
    type: str = 'START'
    toplevel: bool = True

    def validate(self) -> None:
        # TODO: Validate that this actually is the site of a start codon if
        # sequence is present.
        pass

STOPNode

Bases: Node

Node type describing a stop codon site.

Source code in src/geffa/geffa.py
class STOPNode(Node):
    """Node type describing a stop codon site."""
    type: str = 'STOP'
    toplevel: bool = True

    def validate(self) -> None:
        # TODO: Validate that this actually is the site of a start codon if
        # sequence is present.
        pass

ScRNANode

Bases: Node

Node type describing a small conditional RNA feature.

Source code in src/geffa/geffa.py
class ScRNANode(Node):
    """Node type describing a small conditional RNA feature."""
    type: str = 'scRNA'
    toplevel: bool = False

    def validate(self) -> None:
        """Validate the scRNA feature."""
        if self.phase != '.':
            raise ValueError('Phase needs to be "." for scRNA.')

        for p in self.parents:
            if p.type != 'gene':
                raise ValueError('scRNA parent needs to be gene')

validate()

Validate the scRNA feature.

Source code in src/geffa/geffa.py
def validate(self) -> None:
    """Validate the scRNA feature."""
    if self.phase != '.':
        raise ValueError('Phase needs to be "." for scRNA.')

    for p in self.parents:
        if p.type != 'gene':
            raise ValueError('scRNA parent needs to be gene')

Seq

Bases: str

Source code in src/geffa/geffa.py
class Seq(str):
    _TRANSLATION_TABLE: dict[int, int | None] = str.maketrans('ACTG', 'TGAC')

    """Sequence class to hold nucleotide sequence"""
    def reverse_complement(self) -> Seq:
        """Return the reverse complement of the given sequence"""
        return Seq(self[::-1].translate(Seq._TRANSLATION_TABLE))

    def __getitem__(self, *args, **kwargs) -> Seq:
        return Seq(super().__getitem__(*args, **kwargs))

reverse_complement()

Return the reverse complement of the given sequence

Source code in src/geffa/geffa.py
def reverse_complement(self) -> Seq:
    """Return the reverse complement of the given sequence"""
    return Seq(self[::-1].translate(Seq._TRANSLATION_TABLE))

SequenceRegion

Describes a sequence region (or contig).

Source code in src/geffa/geffa.py
class SequenceRegion:
    """Describes a sequence region (or contig)."""

    # Regular expression to check for gaps.
    _gap_re: Pattern[str] = re.compile('NNN+')  # Gaps are at least 3 Ns

    def __init__(
        self,
        name: str,
        start: int,
        end: int,
        sequence: str | None = None
    ):
        """Initialize the sequence region.

        Args:
            name: Name of the contig (no spaces allowed).
            start: Start coordinate of the contig.
            end: End coordinate of the contig.
            sequence: Nucleotide sequence of the contig.
        """
        if (not issubclass(type(name), str)) or (name == '') or (' ' in name):
            raise ValueError('Name needs to be a valid string without spaces')
        if start > end:
            raise ValueError('Start needs to come before end.')
        if sequence is not None:
            if end - start + 1 != len(sequence):
                end = start + len(sequence) - 1

        self.name: str = name
        self.start: int = start
        self.end: int = end
        self.sequence = Seq(sequence) if sequence is not None else None
        self.node_registry: dict[str, Node] = {}

    def validate(self) -> None:
        """Validate the sequence region and all the feature nodes it contains.
        Checks for CDS feature overlaps."""
        for node in self.node_registry.values():
            if node.type == 'gene':
                node._validate()

        # Check for CDS overlaps
        CDSs: list = sorted(
            [
                entry for entry in self.node_registry.values()
                if entry.type == 'CDS'
            ],
            key=lambda CDS: CDS.start
        )
        for i, CDS1 in enumerate(CDSs):
            for CDS2 in CDSs[i+1:]:
                if (CDS1.end >= CDS2.start):
                    CDS1.issues.append(CDSOverlap(CDS1, CDS2))
                    CDS2.issues.append(CDSOverlap(CDS2, CDS1))

    def add_node(self, node: Node) -> None:
        """Add a given node."""
        if node.attributes['ID'] in self.node_registry:
            raise ValueError(
                f'Node with ID "{node.attributes["ID"]}" already exists in '
                'node registry!'
            )
        if node.sequence_region != self:
            raise ValueError("Node doesn't belong to this sequence region!")

        self.node_registry[node.attributes["ID"]] = node

    def trim_sequence(
        self,
        start: int,
        end: int,
        delete_overlapping_features: bool = True,
    ):
        """Trim the sequence region to the given coordinates.

        Args:
            start: Trim to this start position.
            end: Trim to this end position.
            delete_overlapping_features: Allow deletion of features that
                overlap the new sequence region boundaries.
        """
        logger.debug(f'{self.name}: Trimming sequence region.')
        overlapping_features: list[Node] = [
            node for node in self.node_registry.values()
            if (((node.start <= start) and (node.end >= end)) or
                ((node.start >= start) and (node.end <= end)) or
                ((node.start <= start) and (node.end >= start)) or
                ((node.start <= end) and (node.end >= end)))]
        if (not delete_overlapping_features) and overlapping_features:
            raise NotImplementedError()
        overlapping_genes: list = []

        # BUG!!!!
        # This isn't currently being used to delete any genes, there is a bug!
        def recursively_find_gene(node) -> None:
            if node.type == 'gene':
                overlapping_features.add(node)
            else:
                for p in node.parents[0]:
                    recursively_find_gene(p)
        for gene in overlapping_genes:
            gene.delete()

        trim_length: int = end - start + 2
        affected_features: list[Node] = [
            node for node in self.node_registry.values() if node.start > end]
        for feature in affected_features:
            feature.start -= trim_length
            feature.end -= trim_length

        if self.sequence is not None:
            seq: str = str(self.sequence)
            new_seq: str = seq[:start-1] + seq[end+1:]
            self.sequence = Seq(new_seq)

    def closest_node_of_type(
        self,
        node: Node,
        node_types: list[str] | None = None,
        direction: Literal['forward', 'reverse', 'both'] = 'both',
        strand: Literal['-', '+'] | None = None
    ) -> list[Node]:
        '''Return the node closest to the given `node` of the specified
        type(s) in the specified direction. Please note that the direction is
        reversed if the given node's strand is '-'.

        For example, to find the CDS that's closest to the given SLAS node:

        ```python
        closest_cds = sequence_region.closest_node_of_type(
            slas_node,
            ["CDS"],
            "forward"
        )
        ```

        Args:
            node: The node / feature for which we want to find the closest
                match.
            node_types: The list of node types we allow in the results. Can be
                `None` in which case any node type is allowed. Default `None`.
            direction: Which stranded direction to search for (i.e. forward on
                the - strand will search towards the start of the sequence
                region). Default "both".
            strand: Only find nodes on the given strand. Can be `None` in
                which case the strand is not used for filtering.
                Default `None`.
        '''

        type_filter_func: Callable[[Node], bool] = lambda x: True
        if node_types is not None:
            if not isinstance(node_types, list):
                node_types: list[str] = [node_types]
            if strand is not None:
                type_filter_func = (
                    lambda x: x.type in node_types and x.strand == strand
                )
            else:
                type_filter_func = lambda x: x.type in node_types  # noqa: E731
        elif strand is not None:
            type_filter_func = lambda x: x.strand == strand  # noqa: E731

        # Get a list of all nodes of the given type(s) in the sequence region,
        # sorted by the start cordinate.
        if node.strand == '+':
            nodes: list[Node] = sorted(
                [
                    feature for feature in self.node_registry.values()
                    if type_filter_func(feature) and feature.start >= node.end
                    ],
                key=lambda x: x.start
            )
        elif node.strand == '-':
            nodes: list[Node] = sorted(
                [
                    feature for feature in self.node_registry.values()
                    if type_filter_func(feature) and feature.end <= node.end
                ],
                key=lambda x: x.start
            )

        if node.strand == '-':
            # NOTE: This works well if we don't have overlapping features. If
            # we do, then the behavior is somewhat undefined!
            pass
        found_nodes = []
        if direction == 'forward' or direction == 'both':
            idx_fwd: int = bisect.bisect_right(
                nodes, node.end, key=lambda x: x.start)
            if idx_fwd < len(nodes):  # No node found otherwise
                found_nodes.append(nodes[idx_fwd])
        if direction == 'backward' or direction == 'both':
            idx_back: int = bisect.bisect_left(
                nodes[::-1], node.start, key=lambda x: x.end)
            if idx_back < len(nodes):  # No node found otherwise
                found_nodes.append(nodes[idx_back])
        return sorted(found_nodes, key=lambda x: abs(node.start - x.start))

    def __str__(self) -> str:
        return f'##sequence-region\t{self.name}\t{self.start}\t{self.end}'

    def nodes_of_type(
        self,
        node_type: type[NodeType] | None = None,
    ) -> list[NodeType]:
        """Return a list of nodes of the given type within the sequence region
        of all nodes of `node_type=None`.

        For example, to get all gene features in a given sequence region, do:
        ```python
        from geffa.geffa import GffFile, GeneNode

        gff = GffFile(filename)
        all_genes = []
        for sequence_region in gff.sequence_regions.values():
            all_genes.extend(sequence_region.nodes_of_type(GeneNode))
        ```

        Args:
            node_type: Type of the nodes to be returned, e.g. `GeneNode`, or
                `None` in which case all nodes in the sequence region will be
                returned. Default `None`.

        Returns:
            A list containing all nodes of the given type in the sequence
            region.
        """
        if node_type is None:
            return list(self.node_registry.values())
        return [
            node for node in self.node_registry.values()
            if node.type == node_type.type
        ]

__init__(name, start, end, sequence=None)

Initialize the sequence region.

Parameters:

Name Type Description Default
name str

Name of the contig (no spaces allowed).

required
start int

Start coordinate of the contig.

required
end int

End coordinate of the contig.

required
sequence str | None

Nucleotide sequence of the contig.

None
Source code in src/geffa/geffa.py
def __init__(
    self,
    name: str,
    start: int,
    end: int,
    sequence: str | None = None
):
    """Initialize the sequence region.

    Args:
        name: Name of the contig (no spaces allowed).
        start: Start coordinate of the contig.
        end: End coordinate of the contig.
        sequence: Nucleotide sequence of the contig.
    """
    if (not issubclass(type(name), str)) or (name == '') or (' ' in name):
        raise ValueError('Name needs to be a valid string without spaces')
    if start > end:
        raise ValueError('Start needs to come before end.')
    if sequence is not None:
        if end - start + 1 != len(sequence):
            end = start + len(sequence) - 1

    self.name: str = name
    self.start: int = start
    self.end: int = end
    self.sequence = Seq(sequence) if sequence is not None else None
    self.node_registry: dict[str, Node] = {}

add_node(node)

Add a given node.

Source code in src/geffa/geffa.py
def add_node(self, node: Node) -> None:
    """Add a given node."""
    if node.attributes['ID'] in self.node_registry:
        raise ValueError(
            f'Node with ID "{node.attributes["ID"]}" already exists in '
            'node registry!'
        )
    if node.sequence_region != self:
        raise ValueError("Node doesn't belong to this sequence region!")

    self.node_registry[node.attributes["ID"]] = node

closest_node_of_type(node, node_types=None, direction='both', strand=None)

Return the node closest to the given node of the specified type(s) in the specified direction. Please note that the direction is reversed if the given node's strand is '-'.

For example, to find the CDS that's closest to the given SLAS node:

closest_cds = sequence_region.closest_node_of_type(
    slas_node,
    ["CDS"],
    "forward"
)

Parameters:

Name Type Description Default
node Node

The node / feature for which we want to find the closest match.

required
node_types list[str] | None

The list of node types we allow in the results. Can be None in which case any node type is allowed. Default None.

None
direction Literal['forward', 'reverse', 'both']

Which stranded direction to search for (i.e. forward on the - strand will search towards the start of the sequence region). Default "both".

'both'
strand Literal['-', '+'] | None

Only find nodes on the given strand. Can be None in which case the strand is not used for filtering. Default None.

None
Source code in src/geffa/geffa.py
def closest_node_of_type(
    self,
    node: Node,
    node_types: list[str] | None = None,
    direction: Literal['forward', 'reverse', 'both'] = 'both',
    strand: Literal['-', '+'] | None = None
) -> list[Node]:
    '''Return the node closest to the given `node` of the specified
    type(s) in the specified direction. Please note that the direction is
    reversed if the given node's strand is '-'.

    For example, to find the CDS that's closest to the given SLAS node:

    ```python
    closest_cds = sequence_region.closest_node_of_type(
        slas_node,
        ["CDS"],
        "forward"
    )
    ```

    Args:
        node: The node / feature for which we want to find the closest
            match.
        node_types: The list of node types we allow in the results. Can be
            `None` in which case any node type is allowed. Default `None`.
        direction: Which stranded direction to search for (i.e. forward on
            the - strand will search towards the start of the sequence
            region). Default "both".
        strand: Only find nodes on the given strand. Can be `None` in
            which case the strand is not used for filtering.
            Default `None`.
    '''

    type_filter_func: Callable[[Node], bool] = lambda x: True
    if node_types is not None:
        if not isinstance(node_types, list):
            node_types: list[str] = [node_types]
        if strand is not None:
            type_filter_func = (
                lambda x: x.type in node_types and x.strand == strand
            )
        else:
            type_filter_func = lambda x: x.type in node_types  # noqa: E731
    elif strand is not None:
        type_filter_func = lambda x: x.strand == strand  # noqa: E731

    # Get a list of all nodes of the given type(s) in the sequence region,
    # sorted by the start cordinate.
    if node.strand == '+':
        nodes: list[Node] = sorted(
            [
                feature for feature in self.node_registry.values()
                if type_filter_func(feature) and feature.start >= node.end
                ],
            key=lambda x: x.start
        )
    elif node.strand == '-':
        nodes: list[Node] = sorted(
            [
                feature for feature in self.node_registry.values()
                if type_filter_func(feature) and feature.end <= node.end
            ],
            key=lambda x: x.start
        )

    if node.strand == '-':
        # NOTE: This works well if we don't have overlapping features. If
        # we do, then the behavior is somewhat undefined!
        pass
    found_nodes = []
    if direction == 'forward' or direction == 'both':
        idx_fwd: int = bisect.bisect_right(
            nodes, node.end, key=lambda x: x.start)
        if idx_fwd < len(nodes):  # No node found otherwise
            found_nodes.append(nodes[idx_fwd])
    if direction == 'backward' or direction == 'both':
        idx_back: int = bisect.bisect_left(
            nodes[::-1], node.start, key=lambda x: x.end)
        if idx_back < len(nodes):  # No node found otherwise
            found_nodes.append(nodes[idx_back])
    return sorted(found_nodes, key=lambda x: abs(node.start - x.start))

nodes_of_type(node_type=None)

Return a list of nodes of the given type within the sequence region of all nodes of node_type=None.

For example, to get all gene features in a given sequence region, do:

from geffa.geffa import GffFile, GeneNode

gff = GffFile(filename)
all_genes = []
for sequence_region in gff.sequence_regions.values():
    all_genes.extend(sequence_region.nodes_of_type(GeneNode))

Parameters:

Name Type Description Default
node_type type[NodeType] | None

Type of the nodes to be returned, e.g. GeneNode, or None in which case all nodes in the sequence region will be returned. Default None.

None

Returns:

Type Description
list[NodeType]

A list containing all nodes of the given type in the sequence

list[NodeType]

region.

Source code in src/geffa/geffa.py
def nodes_of_type(
    self,
    node_type: type[NodeType] | None = None,
) -> list[NodeType]:
    """Return a list of nodes of the given type within the sequence region
    of all nodes of `node_type=None`.

    For example, to get all gene features in a given sequence region, do:
    ```python
    from geffa.geffa import GffFile, GeneNode

    gff = GffFile(filename)
    all_genes = []
    for sequence_region in gff.sequence_regions.values():
        all_genes.extend(sequence_region.nodes_of_type(GeneNode))
    ```

    Args:
        node_type: Type of the nodes to be returned, e.g. `GeneNode`, or
            `None` in which case all nodes in the sequence region will be
            returned. Default `None`.

    Returns:
        A list containing all nodes of the given type in the sequence
        region.
    """
    if node_type is None:
        return list(self.node_registry.values())
    return [
        node for node in self.node_registry.values()
        if node.type == node_type.type
    ]

trim_sequence(start, end, delete_overlapping_features=True)

Trim the sequence region to the given coordinates.

Parameters:

Name Type Description Default
start int

Trim to this start position.

required
end int

Trim to this end position.

required
delete_overlapping_features bool

Allow deletion of features that overlap the new sequence region boundaries.

True
Source code in src/geffa/geffa.py
def trim_sequence(
    self,
    start: int,
    end: int,
    delete_overlapping_features: bool = True,
):
    """Trim the sequence region to the given coordinates.

    Args:
        start: Trim to this start position.
        end: Trim to this end position.
        delete_overlapping_features: Allow deletion of features that
            overlap the new sequence region boundaries.
    """
    logger.debug(f'{self.name}: Trimming sequence region.')
    overlapping_features: list[Node] = [
        node for node in self.node_registry.values()
        if (((node.start <= start) and (node.end >= end)) or
            ((node.start >= start) and (node.end <= end)) or
            ((node.start <= start) and (node.end >= start)) or
            ((node.start <= end) and (node.end >= end)))]
    if (not delete_overlapping_features) and overlapping_features:
        raise NotImplementedError()
    overlapping_genes: list = []

    # BUG!!!!
    # This isn't currently being used to delete any genes, there is a bug!
    def recursively_find_gene(node) -> None:
        if node.type == 'gene':
            overlapping_features.add(node)
        else:
            for p in node.parents[0]:
                recursively_find_gene(p)
    for gene in overlapping_genes:
        gene.delete()

    trim_length: int = end - start + 2
    affected_features: list[Node] = [
        node for node in self.node_registry.values() if node.start > end]
    for feature in affected_features:
        feature.start -= trim_length
        feature.end -= trim_length

    if self.sequence is not None:
        seq: str = str(self.sequence)
        new_seq: str = seq[:start-1] + seq[end+1:]
        self.sequence = Seq(new_seq)

validate()

Validate the sequence region and all the feature nodes it contains. Checks for CDS feature overlaps.

Source code in src/geffa/geffa.py
def validate(self) -> None:
    """Validate the sequence region and all the feature nodes it contains.
    Checks for CDS feature overlaps."""
    for node in self.node_registry.values():
        if node.type == 'gene':
            node._validate()

    # Check for CDS overlaps
    CDSs: list = sorted(
        [
            entry for entry in self.node_registry.values()
            if entry.type == 'CDS'
        ],
        key=lambda CDS: CDS.start
    )
    for i, CDS1 in enumerate(CDSs):
        for CDS2 in CDSs[i+1:]:
            if (CDS1.end >= CDS2.start):
                CDS1.issues.append(CDSOverlap(CDS1, CDS2))
                CDS2.issues.append(CDSOverlap(CDS2, CDS1))

SnRNANode

Bases: Node

Node type describing a small nuclear RNA feature.

Source code in src/geffa/geffa.py
class SnRNANode(Node):
    """Node type describing a small nuclear RNA feature."""
    type: str = 'snRNA'
    toplevel: bool = False

    def validate(self) -> None:
        """Validate the snRNA feature."""
        if self.phase != '.':
            raise ValueError('Phase needs to be "." for snRNA.')

        for p in self.parents:
            if p.type != 'gene':
                raise ValueError('snRNA parent needs to be gene')

validate()

Validate the snRNA feature.

Source code in src/geffa/geffa.py
def validate(self) -> None:
    """Validate the snRNA feature."""
    if self.phase != '.':
        raise ValueError('Phase needs to be "." for snRNA.')

    for p in self.parents:
        if p.type != 'gene':
            raise ValueError('snRNA parent needs to be gene')

SnoRNANode

Bases: Node

Node type describing a small nucleolar RNA feature.

Source code in src/geffa/geffa.py
class SnoRNANode(Node):
    """Node type describing a small nucleolar RNA feature."""
    type: str = 'snoRNA'
    toplevel: bool = False

    def validate(self) -> None:
        """Validate the snoRNA feature."""
        if self.phase != '.':
            raise ValueError('Phase needs to be "." for snoRNA.')

        for p in self.parents:
            if p.type != 'gene':
                raise ValueError('snoRNA parent needs to be gene')

validate()

Validate the snoRNA feature.

Source code in src/geffa/geffa.py
def validate(self) -> None:
    """Validate the snoRNA feature."""
    if self.phase != '.':
        raise ValueError('Phase needs to be "." for snoRNA.')

    for p in self.parents:
        if p.type != 'gene':
            raise ValueError('snoRNA parent needs to be gene')

TRNANode

Bases: Node

Node type describing a t-RNA feature.

Source code in src/geffa/geffa.py
class TRNANode(Node):
    """Node type describing a t-RNA feature."""
    type: str = 'tRNA'
    toplevel: bool = False

    def validate(self) -> None:
        """Validate the t-RNA feature."""
        if self.phase != '.':
            raise ValueError('Phase needs to be "." for tRNA.')

        for p in self.parents:
            if p.type != 'gene':
                raise ValueError('tRNA parent needs to be gene')

validate()

Validate the t-RNA feature.

Source code in src/geffa/geffa.py
def validate(self) -> None:
    """Validate the t-RNA feature."""
    if self.phase != '.':
        raise ValueError('Phase needs to be "." for tRNA.')

    for p in self.parents:
        if p.type != 'gene':
            raise ValueError('tRNA parent needs to be gene')

ThreePrimeUTRNode

Bases: Node

Node type describing a 3' untranslated region (UTR).

Source code in src/geffa/geffa.py
class ThreePrimeUTRNode(Node):
    """Node type describing a 3' untranslated region (UTR)."""
    type: str = 'three_prime_UTR'
    toplevel: bool = False

    def validate(self) -> None:
        """Validate the 3'UTR feature."""
        if self.phase != '.':
            raise ValueError('Phase needs to be "." for three_prime_UTR.')

        for p in self.parents:
            if p.type != 'mRNA':
                raise ValueError('three_prime_UTR parent needs to be mRNA')

validate()

Validate the 3'UTR feature.

Source code in src/geffa/geffa.py
def validate(self) -> None:
    """Validate the 3'UTR feature."""
    if self.phase != '.':
        raise ValueError('Phase needs to be "." for three_prime_UTR.')

    for p in self.parents:
        if p.type != 'mRNA':
            raise ValueError('three_prime_UTR parent needs to be mRNA')