1 /************************************************************************
7 * This is an implemention of Unicode's Bidirectional Algorithm
10 * http://www.unicode.org/reports/tr9/
12 * Author: Ahmad Khalifa
15 * Revision Details: (Updated by Revision Control System)
21 * (www.arabeyes.org - under MIT license)
23 ************************************************************************/
28 * - Explicit marks need to be handled (they are not 100% now)
35 * Flips the text buffer, according to max level, and
39 * from: text buffer, on which to apply flipping
40 * level: resolved levels buffer
41 * max: the maximum level found in this line (should be unsigned char)
42 * count: line size in bidi_char
44 void flipThisRun(bidi_char
*from
, unsigned char *level
, int max
, int count
)
46 int i
, j
, rcount
, tlevel
;
50 while(i
<count
&& j
<count
)
53 /* find the start of the run of level=max */
55 i
= j
= findIndexOfRun(level
, i
, count
, max
);
56 /* find the end of the run */
57 while(i
<count
&& tlevel
<= level
[i
])
62 for(; rcount
>((i
-j
)/2); rcount
--)
64 temp
= from
[j
+rcount
-1];
65 from
[j
+rcount
-1] = from
[i
-rcount
];
66 from
[i
-rcount
] = temp
;
72 * Finds the index of a run with level equals tlevel
74 int findIndexOfRun(unsigned char* level
, int start
, int count
, int tlevel
)
77 for(i
=start
; i
<count
; i
++)
79 if(tlevel
== level
[i
])
88 * Returns character type of ch, by calling RLE table lookup
91 unsigned char getType(wchar_t ch
)
97 * The most significant 2 bits of each level are used to store
98 * Override status of each character
99 * This function sets the override bits of level according
100 * to the value in override, and reurns the new byte.
102 unsigned char setOverrideBits(unsigned char level
, unsigned char override
)
106 else if(override
== R
)
108 else if(override
== L
)
113 /* Dont remember what this was used for :-) */
114 unsigned char getPreviousLevel(unsigned char* level
, int from
)
116 unsigned char current
;
118 current
= level
[from
];
119 while(from
>0 && level
[from
] == current
)
123 return level
[++from
];
127 * Returns the first odd value greater than x
129 unsigned char leastGreaterOdd(unsigned char x
)
138 * Returns the first even value greater than x
140 unsigned char leastGreaterEven(unsigned char x
)
149 * Loops over the RLE_table array looking for the
152 unsigned char getRLE(wchar_t ch
)
157 for(i
=0; i
<0xFFFF; i
++)
159 freq
= ((RLENode
*)RLE_table
)[i
].f
;
162 return ((RLENode
*)RLE_table
)[i
].d
;
164 return ((RLENode
*)RLE_table
)[i
-1].d
;
166 /* this is here to stop compiler nagging */
170 /* The Main shaping function, and the only one to be used
171 * by the outside world.
173 * line: buffer to apply shaping to. this must be passed by doBidi() first
174 * to: output buffer for the shaped data
175 * count: number of characters in line
177 int do_shape(bidi_char
*line
, bidi_char
*to
, int count
)
179 int i
, tempShape
, ligFlag
;
181 for(ligFlag
=i
=0; i
<count
; i
++)
184 tempShape
= STYPE(line
[i
].wc
);
194 tempShape
= STYPE(line
[i
+1].wc
);
195 if((tempShape
== SL
) || (tempShape
== SD
) || (tempShape
== SC
))
196 to
[i
].wc
= SFINAL((SISOLATED(line
[i
].wc
)));
198 to
[i
].wc
= SISOLATED(line
[i
].wc
);
204 tempShape
= STYPE(line
[i
+1].wc
);
205 if(line
[i
].wc
== 0x644)
211 if((tempShape
== SL
) || (tempShape
== SD
) || (tempShape
== SC
))
218 if((tempShape
== SL
) || (tempShape
== SD
) || (tempShape
== SC
))
225 if((tempShape
== SL
) || (tempShape
== SD
) || (tempShape
== SC
))
232 if((tempShape
== SL
) || (tempShape
== SD
) || (tempShape
== SC
))
246 if((tempShape
== SL
) || (tempShape
== SD
) || (tempShape
== SC
))
248 tempShape
= STYPE(line
[i
-1].wc
);
249 if((tempShape
== SR
) || (tempShape
== SD
) || (tempShape
== SC
))
250 to
[i
].wc
= SMEDIAL( (SISOLATED(line
[i
].wc
)) );
252 to
[i
].wc
= SFINAL((SISOLATED(line
[i
].wc
)));
256 tempShape
= STYPE(line
[i
-1].wc
);
257 if((tempShape
== SR
) || (tempShape
== SD
) || (tempShape
== SC
))
258 to
[i
].wc
= SINITIAL((SISOLATED(line
[i
].wc
)));
260 to
[i
].wc
= SISOLATED(line
[i
].wc
);
270 * The Main Bidi Function, and the only function that should
271 * be used by the outside world.
273 * line: a buffer of size count containing text to apply
274 * the Bidirectional algorithm to.
277 int do_bidi(bidi_char
*line
, int count
)
279 unsigned char* types
;
280 unsigned char* levels
;
281 unsigned char paragraphLevel
;
282 unsigned char currentEmbedding
;
283 unsigned char currentOverride
;
284 unsigned char tempType
;
285 int i
, j
, imax
, yes
, bover
;
287 /* Check the presence of R or AL types as optimization */
289 for(i
=0; i
<count
; i
++)
291 if(getType(line
[i
].wc
) == R
|| getType(line
[i
].wc
) == AL
)
300 /* Initialize types, levels */
301 types
= malloc(sizeof(unsigned char) * count
);
302 levels
= malloc(sizeof(unsigned char) * count
);
304 /* Rule (P1) NOT IMPLEMENTED
305 * P1. Split the text into separate paragraphs. A paragraph separator is
306 * kept with the previous paragraph. Within each paragraph, apply all the
307 * other rules of this algorithm.
311 * P2. In each paragraph, find the first character of type L, AL, or R.
312 * P3. If a character is found in P2 and it is of type AL or R, then set
313 * the paragraph embedding level to one; otherwise, set it to zero.
316 for( i
=0; i
<count
; i
++)
318 if(getType(line
[i
].wc
) == R
|| getType(line
[i
].wc
) == AL
)
323 else if(getType(line
[i
].wc
) == L
)
328 * X1. Begin by setting the current embedding level to the paragraph
329 * embedding level. Set the directional override status to neutral.
331 currentEmbedding
= paragraphLevel
;
332 currentOverride
= ON
;
334 /* Rule (X2), (X3), (X4), (X5), (X6), (X7), (X8)
335 * X2. With each RLE, compute the least greater odd embedding level.
336 * X3. With each LRE, compute the least greater even embedding level.
337 * X4. With each RLO, compute the least greater odd embedding level.
338 * X5. With each LRO, compute the least greater even embedding level.
339 * X6. For all types besides RLE, LRE, RLO, LRO, and PDF:
340 * a. Set the level of the current character to the current
342 * b. Whenever the directional override status is not neutral,
343 * reset the current character type to the directional
345 * X7. With each PDF, determine the matching embedding or override code.
346 * If there was a valid matching code, restore (pop) the last
347 * remembered (pushed) embedding level and directional override.
348 * X8. All explicit directional embeddings and overrides are completely
349 * terminated at the end of each paragraph. Paragraph separators are not
350 * included in the embedding. (Useless here) NOT IMPLEMENTED
353 for( i
=0; i
<count
; i
++)
355 tempType
= getType(line
[i
].wc
);
359 currentEmbedding
= levels
[i
] = leastGreaterOdd(currentEmbedding
);
360 levels
[i
] = setOverrideBits(levels
[i
], currentOverride
);
361 currentOverride
= ON
;
365 currentEmbedding
= levels
[i
] = leastGreaterEven(currentEmbedding
);
366 levels
[i
] = setOverrideBits(levels
[i
], currentOverride
);
367 currentOverride
= ON
;
371 currentEmbedding
= levels
[i
] = leastGreaterOdd(currentEmbedding
);
372 tempType
= currentOverride
= R
;
377 currentEmbedding
= levels
[i
] = leastGreaterEven(currentEmbedding
);
378 tempType
= currentOverride
= L
;
383 currentEmbedding
= getPreviousLevel(levels
, i
);
384 currentOverride
= currentEmbedding
& OMASK
;
385 currentEmbedding
= currentEmbedding
& ~OMASK
;
386 levels
[i
] = currentEmbedding
;
389 /* Whitespace is treated as neutral for now */
392 levels
[i
] = currentEmbedding
;
394 if(currentOverride
!= ON
)
395 tempType
= currentOverride
;
399 levels
[i
] = currentEmbedding
;
400 if(currentOverride
!= ON
)
401 tempType
= currentOverride
;
407 /* this clears out all overrides, so we can use levels safely... */
408 /* checks bover first */
410 for( i
=0; i
<count
; i
++)
411 levels
[i
] = levels
[i
] & LMASK
;
414 * X9. Remove all RLE, LRE, RLO, LRO, PDF, and BN codes.
415 * Here, they're converted to BN.
417 for(i
=0; i
<count
; i
++)
432 * W1. Examine each non-spacing mark (NSM) in the level run, and change
433 * the type of the NSM to the type of the previous character. If the NSM
434 * is at the start of the level run, it will get the type of sor.
437 types
[0] = paragraphLevel
;
439 for(i
=1; i
<count
; i
++)
442 types
[i
] = types
[i
-1];
443 /* Is this a safe assumption?
444 * I assumed the previous, IS a character.
449 * W2. Search backwards from each instance of a European number until the
450 * first strong type (R, L, AL, or sor) is found. If an AL is found,
451 * change the type of the European number to Arabic number.
453 for(i
=0; i
<count
; i
++)
464 }else if(types
[j
] == R
|| types
[j
] == L
)
474 * W3. Change all ALs to R.
476 * Optimization: on Rule Xn, we might set a flag on AL type
477 * to prevent this loop in L R lines only...
479 for(i
=0; i
<count
; i
++)
486 * W4. A single European separator between two European numbers changes
487 * to a European number. A single common separator between two numbers
488 * of the same type changes to that type.
490 for( i
=0; i
<(count
-1); i
++)
494 if(types
[i
-1] == EN
&& types
[i
+1] == EN
)
496 }else if(types
[i
] == CS
)
498 if(types
[i
-1] == EN
&& types
[i
+1] == EN
)
500 else if(types
[i
-1] == AN
&& types
[i
+1] == AN
)
506 * W5. A sequence of European terminators adjacent to European numbers
507 * changes to all European numbers.
509 * Optimization: lots here... else ifs need rearrangement
511 for(i
=0; i
<count
; i
++)
519 }else if(types
[i
+1] == EN
)
523 }else if(types
[i
+1] == ET
)
526 while(j
<count
&& types
[j
] == ET
)
537 * W6. Otherwise, separators and terminators change to Other Neutral:
539 for(i
=0; i
<count
; i
++)
552 * W7. Search backwards from each instance of a European number until
553 * the first strong type (R, L, or sor) is found. If an L is found,
554 * then change the type of the European number to L.
556 for(i
=0; i
<count
; i
++)
568 else if(types
[j
] == R
|| types
[j
] == AL
)
578 * N1. A sequence of neutrals takes the direction of the surrounding
579 * strong text if the text on both sides has the same direction. European
580 * and Arabic numbers are treated as though they were R.
584 if((types
[1] == R
) || (types
[1] == EN
) || (types
[1] == AN
))
586 else if(types
[1] == L
)
589 for(i
=1; i
<(count
-1); i
++)
596 while(j
<(count
-1) && types
[j
] == ON
)
609 }else if((types
[i
-1] == R
) ||
610 (types
[i
-1] == EN
) ||
614 while(j
<(count
-1) && types
[j
] == ON
)
618 if((types
[j
] == R
) ||
631 if(types
[count
-1] == ON
)
633 if(types
[count
-2] == R
|| types
[count
-2] == EN
|| types
[count
-2] == AN
)
635 else if(types
[count
-2] == L
)
640 * N2. Any remaining neutrals take the embedding direction.
642 for(i
=0; i
<count
; i
++)
646 if((levels
[i
] % 2) == 0)
654 * I1. For all characters with an even (left-to-right) embedding
655 * direction, those of type R go up one level and those of type AN or
656 * EN go up two levels.
658 for(i
=0; i
<count
; i
++)
660 if((levels
[i
] % 2) == 0)
664 else if(types
[i
] == AN
|| types
[i
] == EN
)
670 * I2. For all characters with an odd (right-to-left) embedding direction,
671 * those of type L, EN or AN go up one level.
673 for(i
=0; i
<count
; i
++)
675 if((levels
[i
] % 2) == 1)
677 if(types
[i
] == L
|| types
[i
] == EN
|| types
[i
] == AN
)
683 * L1. On each line, reset the embedding level of the following characters
684 * to the paragraph embedding level:
685 * (1)segment separators, (2)paragraph separators,
686 * (3)any sequence of whitespace characters preceding
687 * a segment separator or paragraph separator,
688 * (4)and any sequence of white space characters
689 * at the end of the line.
690 * The types of characters used here are the original types, not those
691 * modified by the previous phase.
694 while(j
>0 && (getType(line
[j
].wc
) == WS
))
700 for(j
++; j
<count
; j
++)
701 levels
[j
] = paragraphLevel
;
703 for(i
=0; i
<count
; i
++)
705 tempType
= getType(line
[i
].wc
);
709 while(j
<count
&& (getType(line
[j
].wc
) == WS
))
713 if(j
==count
|| getType(line
[j
].wc
) == B
||
714 getType(line
[j
].wc
) == S
)
718 levels
[j
] = paragraphLevel
;
721 }else if(tempType
== B
|| tempType
== S
)
722 levels
[i
] = paragraphLevel
;
725 /* Rule (L4) NOT IMPLEMENTED
726 * L4. A character that possesses the mirrored property as specified by
727 * Section 4.7, Mirrored, must be depicted by a mirrored glyph if the
728 * resolved directionality of that character is R.
730 /* Note: this is implemented before L2 for efficiency */
731 for(i
=0; i
<count
; i
++)
732 if((levels
[i
] % 2) == 1)
733 doMirror(&line
[i
].wc
);
736 * L2. From the highest level found in the text to the lowest odd level on
737 * each line, including intermediate levels not actually present in the
738 * text, reverse any contiguous sequence of characters that are at that
741 /* we flip the character string and leave the level array */
744 tempType
= levels
[0];
747 if(levels
[i
] > tempType
)
749 tempType
= levels
[i
];
754 /* maximum level in tempType, its index in imax. */
755 while(tempType
> 0) /* loop from highest level to the least odd, */
756 { /* which i assume is 1 */
757 flipThisRun(line
, levels
, tempType
, count
);
761 /* Rule (L3) NOT IMPLEMENTED
762 * L3. Combining marks applied to a right-to-left base character will at
763 * this point precede their base character. If the rendering engine
764 * expects them to follow the base characters in the final display
765 * process, then the ordering of the marks and the base character must
775 * Bad, Horrible funtion
776 * takes a pointer to a character that is checked for
777 * having a mirror glyph.
779 void doMirror(wchar_t* ch
)
781 if((*ch
& 0xFF00) == 0)
817 else if((*ch
& 0xFF00) == 0x2000)
847 else if((*ch
& 0xFF00) == 0x2200)
1194 }else if((*ch
& 0xFF00) == 0x2300)
1218 else if((*ch
& 0xFF00) == 0x2700)
1308 else if((*ch
& 0xFF00) == 0x2900)
1440 else if((*ch
& 0xFF00) == 0x2A00)
1686 else if((*ch
& 0xFF00) == 0x3000)
1746 else if((*ch
& 0xFF00) == 0xFF00)